aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore3
-rw-r--r--.gitlab-ci.yml50
-rw-r--r--CONTRIBUTORS4
-rw-r--r--README.md54
-rw-r--r--RUNBOOK.md44
-rw-r--r--TODO57
-rw-r--r--blobs/README.md86
-rw-r--r--blobs/minio/README.md74
-rw-r--r--blobs/minio/minio.conf14
-rw-r--r--blobs/seaweedfs/README.md9
-rw-r--r--blobs/tasks.md53
-rw-r--r--cdx-record-pipeline/README.md33
-rwxr-xr-xcdx-record-pipeline/cdx-record-pipeline.py67
-rw-r--r--cdx-record-pipeline/requirements.txt3
-rw-r--r--extra/docker/README.md11
-rw-r--r--extra/docker/docker-compose.yml39
-rwxr-xr-xfetch_hadoop.sh38
-rw-r--r--hbase/howto.md42
-rw-r--r--hbase/notes.txt232
-rw-r--r--hbase/schema_design.md79
-rw-r--r--kafka/debugging_issues.txt48
-rw-r--r--kafka/grobid_kafka_notes.txt60
-rw-r--r--kafka/howto_rebalance.md43
-rw-r--r--kafka/topics.md214
-rw-r--r--mapreduce/.coveragerc3
-rw-r--r--mapreduce/Pipfile.lock1142
-rw-r--r--mapreduce/README.md74
-rw-r--r--mapreduce/TODO4
-rw-r--r--mapreduce/pytest.ini8
-rw-r--r--mapreduce/xml2json.py7
-rw-r--r--match_test_data/NOTES.txt13
-rw-r--r--match_test_data/RESULTS.txt30
-rw-r--r--match_test_data/crossref_sample.bibjson964
-rw-r--r--match_test_data/grobid_sample.bibjson979
-rw-r--r--match_test_data/math_universe_releases.json4
-rw-r--r--nginx/README.md18
-rw-r--r--nginx/fatcat-blobs51
-rw-r--r--nginx/sandcrawler-db80
-rw-r--r--nginx/sandcrawler-minio57
-rw-r--r--notes/backfill_scalding_rewrite.txt22
-rw-r--r--notes/crawl_cdx_merge.md29
-rw-r--r--notes/fuzzy_match_notes.md148
-rw-r--r--notes/grobid_munging.txt70
-rw-r--r--notes/hadoop_job_log.md210
-rw-r--r--notes/hbase_table_sizes.txt12
-rw-r--r--notes/html_ingest_notes.md318
-rw-r--r--notes/ingest/.gitignore2
-rw-r--r--notes/ingest/2019-10-23_testing.md8
-rw-r--r--notes/ingest/2020-01-14_bulk.md26
-rw-r--r--notes/ingest/2020-02-04_ingest_backfills.md148
-rw-r--r--notes/ingest/2020-02-18_ingest_backfills.md42
-rw-r--r--notes/ingest/2020-02-21_ingest_backfills.md104
-rw-r--r--notes/ingest/2020-02-22_fixed_domain.txt246
-rw-r--r--notes/ingest/2020-02_unpaywall.md624
-rw-r--r--notes/ingest/2020-03-02_ingests.txt174
-rw-r--r--notes/ingest/2020-03-oa_but_not_marked.md25
-rw-r--r--notes/ingest/2020-03_mag.md576
-rw-r--r--notes/ingest/2020-03_s2.md35
-rw-r--r--notes/ingest/2020-04-13_covid19.md73
-rw-r--r--notes/ingest/2020-04_datacite.md121
-rw-r--r--notes/ingest/2020-04_unpaywall.md312
-rw-r--r--notes/ingest/2020-05_oai_pmh.md428
-rw-r--r--notes/ingest/2020-05_pubmed.md10
-rw-r--r--notes/ingest/2020-07_mag.md353
-rw-r--r--notes/ingest/2020-08_daily_improvements.md202
-rw-r--r--notes/ingest/2020-09_oa_doi.md352
-rw-r--r--notes/ingest/2020-09_reingest.md197
-rw-r--r--notes/ingest/2020-09_scielo.md21
-rw-r--r--notes/ingest/2020-10_daily.md193
-rw-r--r--notes/ingest/2020-10_unpaywall.md286
-rw-r--r--notes/ingest/2020-11-04_arxiv.md12
-rw-r--r--notes/ingest/2020-11_doaj.md295
-rw-r--r--notes/ingest/2020-12-08_patch_crawl_notes.md111
-rw-r--r--notes/ingest/2021-04_unpaywall.md368
-rw-r--r--notes/ingest/2021-05_daily_improvements.md480
-rw-r--r--notes/ingest/2021-07_unpaywall.md320
-rw-r--r--notes/ingest/2021-08_mag.md400
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md1578
-rw-r--r--notes/ingest/2021-09-03_daily_improvements.md1021
-rw-r--r--notes/ingest/NEXT.md52
-rwxr-xr-xnotes/ingest/es_csv_to_json.py37
-rw-r--r--notes/library_shopping.txt10
-rw-r--r--notes/match_filter_enrich.txt31
-rw-r--r--notes/old_extract_results.txt50
-rw-r--r--notes/petabox_ia_metadata.txt56
-rw-r--r--notes/tasks/2020-01-06_heuristic_cdx.txt37
-rw-r--r--notes/tasks/2020-01-27_cleanup_cdx.md34
-rw-r--r--notes/tasks/2020-01-27_grobid_backfill.md40
-rw-r--r--notes/tasks/2020-02-14_pdftrio.md162
-rw-r--r--notes/tasks/2020-07-22_processing_holes.md120
-rw-r--r--notes/tasks/2020-08-20_file_meta.md66
-rw-r--r--notes/tasks/2020-10-21_pdfextract_holes.md74
-rw-r--r--notes/tasks/2021-09-09_pdf_url_lists.md66
-rw-r--r--notes/url_pattern_heuristic_backfill.txt104
-rw-r--r--notes/url_pattern_heuristic_verification.txt52
-rw-r--r--pig/README.md9
-rwxr-xr-xpig/fetch_deps.sh20
-rw-r--r--pig/filter-cdx-paper-pdfs.pig2
-rw-r--r--pig/filter-cdx-pdfs.pig24
-rw-r--r--pig/filter-cdx-ps.pig6
-rw-r--r--pig/filter-cdx-source-code-crude.pig40
-rw-r--r--pig/filter-cdx-tarball.pig38
-rw-r--r--pig/join-cdx-sha1.pig43
-rw-r--r--pig/tests/files/example.sha1b324
-rw-r--r--pig/tests/files/sourcecode.cdx6
-rw-r--r--pig/tests/files/tarballs.cdx10
-rw-r--r--pig/tests/pighelper.py5
-rw-r--r--pig/tests/test_filter_cdx_paper_pdfs.py4
-rw-r--r--pig/tests/test_filter_software.py16
-rw-r--r--pig/tests/test_join_cdx.py44
-rwxr-xr-xplease467
-rw-r--r--proposals/2019_ingest.md287
-rw-r--r--proposals/2019_pdftotext_pdfinfo.md123
-rw-r--r--proposals/20200129_pdf_ingest.md272
-rw-r--r--proposals/20200207_pdftrio.md104
-rw-r--r--proposals/20200211_nsq.md79
-rw-r--r--proposals/20201012_no_capture.md36
-rw-r--r--proposals/20201026_html_ingest.md129
-rw-r--r--proposals/20201103_xml_ingest.md81
-rw-r--r--proposals/2020_pdf_meta_thumbnails.md328
-rw-r--r--proposals/2020_seaweed_s3.md426
-rw-r--r--proposals/2021-04-22_crossref_db.md86
-rw-r--r--proposals/2021-09-09_component_ingest.md114
-rw-r--r--proposals/2021-09-13_src_ingest.md53
-rw-r--r--proposals/schema_changes.sql40
-rw-r--r--python/.coveragerc5
-rw-r--r--python/.flake86
-rw-r--r--python/.gitignore11
-rw-r--r--python/.pylintrc (renamed from mapreduce/.pylintrc)6
-rw-r--r--python/Makefile32
-rw-r--r--python/Pipfile65
-rw-r--r--python/Pipfile.lock1515
-rw-r--r--python/TODO7
-rw-r--r--python/example.env7
-rwxr-xr-xpython/grobid2json.py215
-rwxr-xr-xpython/grobid_tool.py149
-rwxr-xr-xpython/ia_pdf_match.py108
-rwxr-xr-xpython/ingest_file.py100
-rwxr-xr-xpython/pdfextract_tool.py139
-rwxr-xr-xpython/pdftrio_tool.py121
-rwxr-xr-xpython/persist_tool.py211
-rw-r--r--python/pytest.ini21
-rw-r--r--python/sandcrawler/__init__.py10
-rw-r--r--python/sandcrawler/db.py418
-rw-r--r--python/sandcrawler/grobid.py130
-rw-r--r--python/sandcrawler/html.py348
-rw-r--r--python/sandcrawler/html_ingest.py441
-rw-r--r--python/sandcrawler/html_metadata.py857
-rw-r--r--python/sandcrawler/ia.py1138
-rw-r--r--python/sandcrawler/ingest.py833
-rw-r--r--python/sandcrawler/minio.py99
-rw-r--r--python/sandcrawler/misc.py222
-rw-r--r--python/sandcrawler/pdfextract.py470
-rw-r--r--python/sandcrawler/pdftrio.py130
-rw-r--r--python/sandcrawler/persist.py584
-rw-r--r--python/sandcrawler/workers.py625
-rw-r--r--python/sandcrawler/xml.py7
-rwxr-xr-xpython/sandcrawler_worker.py380
-rwxr-xr-xpython/scripts/arabesque2ingestrequest.py72
-rwxr-xr-xpython/scripts/cdx_collection.py80
-rwxr-xr-xpython/scripts/covid2ingestrequest.py83
-rwxr-xr-xpython/scripts/deliver_dumpgrobid_to_s3.py125
-rwxr-xr-xpython/scripts/deliver_gwb_to_disk.py166
-rwxr-xr-xpython/scripts/deliver_gwb_to_s3.py184
-rwxr-xr-xpython/scripts/doaj2ingestrequest.py143
-rwxr-xr-xpython/scripts/enrich_scored_matches.py45
-rwxr-xr-xpython/scripts/filter_grobid_metadata.py159
-rwxr-xr-xpython/scripts/filter_groupworks.py144
-rwxr-xr-xpython/scripts/filter_scored_matches.py116
-rwxr-xr-xpython/scripts/grobid_affiliations.py52
-rwxr-xr-xpython/scripts/import_grobid_metadata.py94
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py51
-rwxr-xr-xpython/scripts/manifest_converter.py56
-rwxr-xr-xpython/scripts/oai2ingestrequest.py137
-rwxr-xr-xpython/scripts/pdf_thumbnail.py35
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py111
-rw-r--r--python/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml (renamed from mapreduce/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml)0
-rw-r--r--python/tests/files/dlib_05vanhyning.html350
-rw-r--r--python/tests/files/dummy.pdfbin0 -> 13264 bytes
-rw-r--r--python/tests/files/dummy_zip.zipbin0 -> 37760 bytes
-rw-r--r--python/tests/files/elife_article.html3094
-rw-r--r--python/tests/files/example.cdx (renamed from mapreduce/tests/files/example.cdx)0
-rw-r--r--python/tests/files/example_grobid_metadata.json5
-rw-r--r--python/tests/files/first_monday_ojs3_fulltext.html441
-rw-r--r--python/tests/files/first_monday_ojs3_landingpage.html616
-rw-r--r--python/tests/files/genders_g58_fairlie.html146
-rw-r--r--python/tests/files/nature_article.html1379
-rw-r--r--python/tests/files/peerj_oa_article.html2365
-rw-r--r--python/tests/files/plos_one_article.html1707
-rw-r--r--python/tests/files/scielo_article.jats.xml336
-rw-r--r--python/tests/files/small.json52
-rw-r--r--python/tests/files/small.xml120
-rw-r--r--python/tests/test_grobid.py79
-rw-r--r--python/tests/test_grobid2json.py (renamed from mapreduce/tests/test_grobid2json.py)0
-rw-r--r--python/tests/test_html.py33
-rw-r--r--python/tests/test_html_ingest.py14
-rw-r--r--python/tests/test_html_metadata.py229
-rw-r--r--python/tests/test_ingest.py207
-rw-r--r--python/tests/test_live_wayback.py167
-rw-r--r--python/tests/test_misc.py77
-rw-r--r--python/tests/test_pdfextract.py68
-rw-r--r--python/tests/test_pushers.py28
-rw-r--r--python/tests/test_savepagenow.py204
-rw-r--r--python/tests/test_wayback.py172
-rw-r--r--python/tests/test_xml.py18
l---------python/title_slug_denylist.txt1
-rw-r--r--python_hadoop/Pipfile (renamed from mapreduce/Pipfile)3
-rw-r--r--python_hadoop/Pipfile.lock990
-rw-r--r--python_hadoop/README.md104
-rwxr-xr-xpython_hadoop/backfill_hbase_from_cdx.py (renamed from mapreduce/backfill_hbase_from_cdx.py)0
-rw-r--r--python_hadoop/common.py (renamed from mapreduce/common.py)26
-rwxr-xr-xpython_hadoop/extraction_cdx_grobid.py (renamed from mapreduce/extraction_cdx_grobid.py)57
-rwxr-xr-xpython_hadoop/extraction_ungrobided.py292
-rwxr-xr-xpython_hadoop/grobid2json.py (renamed from mapreduce/grobid2json.py)16
-rwxr-xr-xpython_hadoop/kafka_grobid_hbase.py200
-rw-r--r--python_hadoop/mrjob.conf (renamed from mapreduce/mrjob.conf)0
-rw-r--r--python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml2004
-rw-r--r--python_hadoop/tests/files/example.cdx20
-rw-r--r--python_hadoop/tests/files/example_grobid_metadata.json5
-rw-r--r--python_hadoop/tests/files/example_ungrobided.tsv20
-rw-r--r--python_hadoop/tests/files/small.json (renamed from mapreduce/tests/files/small.json)11
-rw-r--r--python_hadoop/tests/files/small.xml (renamed from mapreduce/tests/files/small.xml)0
-rw-r--r--python_hadoop/tests/test_backfill_hbase_from_cdx.py (renamed from mapreduce/tests/test_backfill_hbase_from_cdx.py)0
-rw-r--r--python_hadoop/tests/test_common.py (renamed from mapreduce/tests/test_common.py)0
-rw-r--r--python_hadoop/tests/test_extraction_cdx_grobid.py (renamed from mapreduce/tests/test_extraction_cdx_grobid.py)2
-rw-r--r--python_hadoop/tests/test_extraction_ungrobided.py178
-rw-r--r--python_hadoop/tests/test_grobid2json.py22
-rw-r--r--sandcrawler-rfc.md180
-rw-r--r--scalding/README.md36
-rw-r--r--scalding/build.sbt10
-rw-r--r--scalding/scalastyle-config.xml2
-rw-r--r--scalding/scalding-debugging.md10
-rw-r--r--scalding/src/main/resources/slug-denylist.txt554
-rw-r--r--scalding/src/main/scala/sandcrawler/BibjsonScorable.scala50
-rw-r--r--scalding/src/main/scala/sandcrawler/CrossrefScorable.scala153
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala36
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala38
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala34
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala41
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala67
-rw-r--r--scalding/src/main/scala/sandcrawler/FatcatScorable.scala146
-rw-r--r--scalding/src/main/scala/sandcrawler/GrobidScorable.scala83
-rw-r--r--scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala59
-rw-r--r--scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala43
-rw-r--r--scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala52
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala37
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala2
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala32
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala10
-rw-r--r--scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala30
-rw-r--r--scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala67
-rw-r--r--scalding/src/main/scala/sandcrawler/Scorable.scala96
-rw-r--r--scalding/src/main/scala/sandcrawler/ScorableFeatures.scala63
-rw-r--r--scalding/src/main/scala/sandcrawler/ScoreInsertable.scala86
-rw-r--r--scalding/src/main/scala/sandcrawler/ScoreJob.scala48
-rw-r--r--scalding/src/main/scala/sandcrawler/StringUtilities.scala76
-rw-r--r--scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala172
-rw-r--r--scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala72
-rw-r--r--scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala160
-rw-r--r--scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala124
-rw-r--r--scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala122
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala1
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala9
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala11
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala71
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala50
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala64
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableTest.scala81
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala262
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala248
-rw-r--r--scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala85
-rw-r--r--sql/README.md160
-rw-r--r--sql/backfill/backfill.md135
-rwxr-xr-xsql/backfill/backfill_cdx.py132
-rwxr-xr-xsql/backfill/backfill_file_meta.py55
-rwxr-xr-xsql/backfill/backfill_grobid.py91
-rwxr-xr-xsql/backfill/backfill_grobid_unpaywall.py59
-rwxr-xr-xsql/backfill/filter_transform_cdx.py88
-rwxr-xr-xsql/backfill/petabox_transform.py24
-rw-r--r--sql/dump_file_meta.sql12
-rw-r--r--sql/dump_regrobid_pdf.sql15
-rw-r--r--sql/dump_regrobid_pdf_petabox.sql15
-rw-r--r--sql/dump_reingest_quarterly.sql31
-rw-r--r--sql/dump_reingest_spn.sql25
-rw-r--r--sql/dump_reingest_weekly.sql31
-rw-r--r--sql/dump_unextracted_pdf.sql22
-rw-r--r--sql/dump_unextracted_pdf_petabox.sql18
-rw-r--r--sql/dump_ungrobid_pdf.sql18
-rw-r--r--sql/dump_ungrobid_pdf_petabox.sql17
-rw-r--r--sql/dump_unmatched_glutton_pdf.sql19
-rw-r--r--sql/example.env1
-rw-r--r--sql/ingest_again.md158
-rw-r--r--sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt326
-rw-r--r--sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt307
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/down.sql6
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/up.sql36
-rw-r--r--sql/migrations/2019-12-19-060141_init/down.sql8
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql184
-rw-r--r--sql/monitoring_queries.md202
-rw-r--r--sql/pdftrio_queries.md65
-rw-r--r--sql/random_queries.md193
-rwxr-xr-xsql/reingest_quarterly.sh19
-rwxr-xr-xsql/reingest_spn.sh19
-rwxr-xr-xsql/reingest_weekly.sh19
l---------sql/sandcrawler_schema.sql1
-rw-r--r--sql/stats/2020-01-13_stats.txt190
-rw-r--r--sql/stats/2020-01-31_supplement.txt42
-rw-r--r--sql/stats/2020-02-24_stats.txt482
-rw-r--r--sql/stats/2020-05-03_stats.txt418
-rw-r--r--sql/stats/2020-07-23_stats.txt347
-rw-r--r--sql/stats/2020-09-14_stats.txt340
-rw-r--r--sql/stats/2021-04-07_stats.txt430
-rw-r--r--sql/stats/2021-04-08_table_sizes.txt40
-rw-r--r--sql/stats/README.md120
-rw-r--r--sql/table_sizes.md11
315 files changed, 52847 insertions, 1508 deletions
diff --git a/.gitignore b/.gitignore
index 28d3c9f..5723f96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,8 @@
mapreduce-*.tar.gz
*,cover
htmlcov/
-mapreduce/venv-current.tar.gz
+python/venv-current.tar.gz
+*.test
*.o
*.a
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3970bbb..3fbc709 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,31 +1,49 @@
-image: python:3.6-stretch
+
+image: ubuntu:focal
+
+variables:
+ LC_ALL: "C.UTF-8"
+ LANG: "C.UTF-8"
+ DEBIAN_FRONTEND: "noninteractive"
+
before_script:
- apt update -qy
- - apt install -y apt-transport-https
- - echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
- - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
- - apt update -qy
- - apt install -y python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt
- - pip3 install pipenv
+ - apt install -y --no-install-recommends apt-transport-https software-properties-common curl dirmngr gpg-agent
+ # scala-sbt.org APT signing key
+ - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0x2EE0EA64E40A89B84B2DF73499E82A75642AC823
+ - apt-add-repository -y "deb https://repo.scala-sbt.org/scalasbt/debian all main"
+ - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget
- pipenv --version
-test_python_mapreduce:
+test_python:
script:
- - cd mapreduce
+ - cd python
+ - pipenv install --dev --deploy
+ - make coverage
+ - make lint
+
+test_python_hadoop:
+ when: manual
+ script:
+ - cd python_hadoop
- pipenv install --dev --deploy
- pipenv run pytest --cov
- - pipenv run pylint --disable bad-continuation,arguments-differ,unidiomatic-typecheck *.py
+# needs fixing; some upstream com.hadoop.gplcompression#hadoop-lzo;0.4.16: java.lang.NullPointerException
+# change happened
test_scalding:
+ when: manual
script:
+ - ./please -h
- cd scalding
- sbt -mem 1024 test
- sbt -mem 1024 assembly
# Needs fixing
-#test_pig:
-# script:
-# - cd pig
-# - ./fetch_deps.sh
-# - pipenv install --dev --deploy
-# - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest
+test_pig:
+ when: manual
+ script:
+ - ./fetch_hadoop.sh
+ - cd pig
+ - pipenv install --dev --deploy
+ - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f6dea1c
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,4 @@
+Bryan Newbold
+
+Ellen Spertus transfers copyright of all of her contributions to the
+repository in exchange for one Internet Archive Sticker, received. \ No newline at end of file
diff --git a/README.md b/README.md
index e53e775..afe1ff6 100644
--- a/README.md
+++ b/README.md
@@ -6,34 +6,50 @@
\ooooooo| |___/\__,_|_| |_|\__,_|\___|_| \__,_| \_/\_/ |_|\___|_|
-This repo contains hadoop tasks (mapreduce and pig), luigi jobs, and other
-scripts and code for the internet archive (web group) journal ingest pipeline.
+This repo contains back-end python workers, scripts, hadoop jobs, luigi tasks,
+and other scripts and code for the Internet Archive web group's journal ingest
+pipeline. This code is of mixed quality and is mostly experimental. The goal
+for most of this is to submit metadata to [fatcat](https://fatcat.wiki), which
+is the more stable, maintained, and public-facing service.
-This repository is potentially public.
+Code in this repository is potentially public! Not intended to accept public
+contributions for the most part. Much of this will not work outside the IA
+cluster environment.
Archive-specific deployment/production guides and ansible scripts at:
-[journal-infra](https://git.archive.org/bnewbold/journal-infra)
+[journal-infra](https://git.archive.org/webgroup/journal-infra)
-## Python Setup
-Pretty much everything here uses python/pipenv. To setup your environment for
-this, and python in general:
+## Repository Layout
- # libjpeg-dev is for some wayback/pillow stuff
- sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential
- pip3 install --user pipenv
+**./proposals/** design documentation and change proposals
-On macOS:
+**./python/** contains scripts and utilities for ingesting content from wayback
+and/or the web (via save-page-now API), and other processing pipelines
- brew install libjpeg pipenv
+**./sql/** contains schema, queries, and backfill scripts for a Postgres SQL
+database index (eg, file metadata, CDX, and GROBID status tables).
-Each directory has it's own environment. Do something like:
+**./pig/** contains a handful of Pig scripts, as well as some unittests
+implemented in python. Only rarely used.
- cd mapreduce
- pipenv install --dev
- pipenv shell
+**./scalding/** contains Hadoop jobs written in Scala using the Scalding
+framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
+brings type safety and compiled performance. Mostly DEPRECATED.
-## Possible Issues with Setup
+**./python_hadoop/** contains Hadoop streaming jobs written in python using the
+`mrjob` library. Mostly DEPRECATED.
-Bryan had `~/.local/bin` in his `$PATH`, and that seemed to make everything
-work.
+
+## Running Python Code
+
+You need python3.8 (or python3.6+ and `pyenv`) and `pipenv` to set up the
+environment. You may also need the debian packages `libpq-dev` and
+`python-dev` to install some dependencies.
+
+
+## Running Hadoop Jobs (DEPRECATED)
+
+The `./please` python3 wrapper script is a helper for running jobs (python or
+scalding) on the IA Hadoop cluster. You'll need to run the setup/dependency
+tasks first; see README files in subdirectories.
diff --git a/RUNBOOK.md b/RUNBOOK.md
new file mode 100644
index 0000000..33d4711
--- /dev/null
+++ b/RUNBOOK.md
@@ -0,0 +1,44 @@
+
+## Process Un-GROBID-ed PDFs from Wayback
+
+Sometimes ingest doesn't pick up everything, or we do some heuristic CDX
+import, and we want to run GROBID over all the PDFs that haven't been processed
+yet. Only want one CDX line per `sha1hex`.
+
+A hybrid SQL/UNIX way of generating processing list:
+
+ psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json
+
+From here, there are two options: enqueue in Kafka and let workers run, or
+create job files and run them using local worker and GNU/parallel.
+
+#### Kafka
+
+Copy/transfer to a Kafka node; load a sample and then the whole output:
+
+ head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1
+ cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1
+
+#### Local JSON
+
+Older example; if this fails, need to re-run entire thing:
+
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+TODO: is it possible to use job log with millions of `--pipe` inputs? That
+would be more efficient in the event of failure.
+
+## GROBID over many .zip files
+
+Want to use GNU/Parallel in a mode that will do retries well:
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
+ sort | \
+ parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+After starting, check that messages are actually getting pushed to kafka
+(producer failures can be silent!). If anything goes wrong, run the exact same
+command again. The sort is to ensure jobs are enqueued in the same order again;
+could also dump `fd` output to a command file first.
+
diff --git a/TODO b/TODO
index 5e9220b..77b48c9 100644
--- a/TODO
+++ b/TODO
@@ -1,22 +1,51 @@
+## Kafka Pipelines
+
+- after network split, mass restarting import/harvest stuff seemed to
+ completely reset consumergroups (!). bunch of LeaderNotFoundError
+ => change/update consumer group config
+ => ensure we are recording timestamps to allow timestamp-based resets
+- refactor python kafka clients (slack convo with kenji+dvd)
+ => try librdkafka?
+ => switch to python-kafka?
+- monitoring/alerting of consumergroup offsets
+ => start with crude python script?
+- document: need to restart all consumers after brokers restart
+- operate on batches, using threads/async, and reduce worker (process) counts
+ dramatically
+
+source of kafka-manager weirdness?
+ Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: org.apache.kafka.common.protocol.types.SchemaException: Error reading field 'user_data': java.nio.BufferUnderflowException
+ Dec 02 01:05:40 wbgrp-svc263.us.archive.org kafka-manager[7032]: [error] k.m.a.c.KafkaManagedOffsetCache - Failed to get member metadata from group summary and member summary : grobid-hbase-insert : MemberSummary(pykafka-8128e0be-4952-4e79-8644-a52987421259,pykafka,/207.241.225.228,[B@6c368f37,[B@2b007e01)
+
+## Other
+
+- paper match heuristic: include 10.1007%2F978-3-319-49304-6_18 (URL-escaped slash)
+- catch EOFFail fetching from wayback
+- "author counts match" in scoring
+- refactor "scorable" to "matchable"
+- look at refactoring to reduce JSON serializations
+- QA tool for matches (PDF + Crossref JSON + landing page?)
+ => python; talks directly to HBase
+- author counts should match (+/- one?)
+
+match strategies (hbase columns):
+- legacy_doi
+- url_doi
+- grobid_crossref (doi)
+- grobid_fatcat (fatcat ID)
+
+scalding:
+- better JSON library
+- less verbose sbt test output (set log level to WARN)
+- auto-formatting: addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.6.0-RC3")
+
pig:
- potentially want to *not* de-dupe CDX lines by uniq sha1 in all cases; run
this as a second-stage filter? for example, may want many URL links in fatcat
for a single file (different links, different policies)
+- fix pig gitlab-ci tests (JAVA_HOME)
+python:
- include input file name (and chunk? and CDX?) in sentry context
-- play with test image on older releases (eg, trusty)
-
- how to get argument (like --hbase-table) into mrjob.conf, or similar?
-- fix pig gitlab-ci tests (JAVA_HOME). also make fetch_deps *way* more quiet
-- sentry: https://github.com/getsentry/raven-python
-
-potential helpers:
-- https://github.com/martinblech/xmltodict
-- https://github.com/trananhkma/fucking-awesome-python#text-processing
-- https://github.com/blaze/blaze (for catalog/analytics)
-- validation: https://github.com/pyeve/cerberus
-- testing (to replace nose):
- - https://github.com/CleanCut/green
- - pytest
- - mamba ("behavior driven")
diff --git a/blobs/README.md b/blobs/README.md
new file mode 100644
index 0000000..555db92
--- /dev/null
+++ b/blobs/README.md
@@ -0,0 +1,86 @@
+
+This document describes sandcrawler/fatcat use of "blob store" infrastructure
+for storing hundreds of millions of small files. For example, GROBID XML
+documents, jpeg thumbnails of PDFs.
+
+The basic feature requirements for this system are:
+
+- don't need preservation data resiliency: all this data is derived from
+ primary content, and is usually redundantly stored in Kafka topics (and thus
+ can be re-indexed to any server bounded only by throughput of the object
+ store service; Kafka is usually faster)
+- don't require SSDs or large amounts of RAM. Ability to accelerate performance
+ with additional RAM or moving indexes to SSD is nice, but we will be using
+ spinning disks for primary data storage
+- hundreds of millions or billions of objects, fetchable by a key we define
+- optional transparent compression (for text and XML)
+- typical object (file) size of 5-200 KBytes uncompressed, want to support up
+ to several MBytes
+- very simple internal API for GET/PUT (S3 API compatible is good)
+- ability to proxy to HTTP publicly for reads (eg, HTTP fall-back with no
+ authentication), controllable by at least bucket granularity
+
+## Infrastructure
+
+`minio` was used initially, but did not scale well in number of files. We
+currently use seaweedfs. Any S3-compatible key/value store should work in
+theory. openlibrary.org has used WARCs in petabox items in the past. Actual
+cloud object stores tend to be expensive for this kind of use case.
+
+The facebook "haystack" project (and whitepaper) are good background reading
+describing one type of system that works well for this application.
+
+
+## Bucket / Folder Structure
+
+Currently we run everything off a single server, with no redundancy. There is
+no QA/prod distinction.
+
+Setting access control and doing bulk deletions is easiest at the bucket level,
+less easy at the folder level, most difficult at the suffix (file extension)
+level.
+
+For files that are derived from PDFs, we use the SHA-1 (in lower-case hex) of
+the source PDF to construct keys. We generate nested "directories" from the hash
+to limit the number of keys per "directory" (even though in S3/seaweedfs there
+are no actual directories involved). The structure looks like:
+
+ <bucket>/<folder>/<byte0>/<byte1>/<sha1hex><suffix>
+
+Eg:
+
+ sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml
+
+The nesting is sort of a hold-over from minio (where files were actually
+on-disk), but seems worth keeping in case we end up switching storage systems
+in the future.
+
+## Existing Content
+
+sandcrawler: internal/controlled access to PDF derivatives
+ grobid: TEI-XML documents
+ extension: .tei.xml
+ text: raw pdftotext (or other text transform)
+ extension: .txt
+
+thumbnail: public bucket for thumbnail images
+ pdf: thumbnails from PDF files
+ extension: .180px.jpg
+
+## Proxy and URLs
+
+Internal HTTP access via:
+
+ http://wbgrp-svc169.us.archive.org:8333/<bucket>/<key>
+
+Public access via:
+
+ https://blobs.fatcat.wiki/<bucket>/<key>
+
+Eg:
+
+ http://wbgrp-svc169.us.archive.org:8333/testing/small.txt
+ http://wbgrp-svc169.us.archive.org:8333/sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml
+ https://blobs.fatcat.wiki/testing/small.txt
+ https://blobs.fatcat.wiki/thumbnail/pdf/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.180px.jpg
+
diff --git a/blobs/minio/README.md b/blobs/minio/README.md
new file mode 100644
index 0000000..d8f1c69
--- /dev/null
+++ b/blobs/minio/README.md
@@ -0,0 +1,74 @@
+
+minio is used as an S3-compatible blob store. Initial use case is GROBID XML
+documents, addressed by the sha1 of the PDF file the XML was extracted from.
+
+Note that on the backend minio is just storing objects as files on disk.
+
+## Deploying minio Server
+
+It seems to be important to use a version of minio from at least December 2019
+era for on-disk compression to actually work.
+
+We currently install minio (and mc, the minio client) in prod by simply
+downloading the binaries and calling from systemd.
+
+## Buckets and Directories
+
+Hosts and buckets:
+
+ localhost:sandcrawler-dev
+ create locally for development (see below)
+
+ cluster:sandcrawler
+ main sandcrawler storage bucket, for GROBID output and other derivatives.
+ Note it isn't "sandcrawler-prod", for backwards compatibility reasons.
+
+ cluster:sandcrawler-qa
+ for, eg, testing on cluster servers
+
+ cluster:unpaywall
+ subset of sandcrawler content crawled due to unpaywall URLs;
+ potentially made publicly accessible
+
+Directory structure within sandcrawler buckets:
+
+ grobid/2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml
+ SHA1 (lower-case hex) of PDF that XML was extracted from
+
+Create new buckets like:
+
+ mc mb cluster/sandcrawler-qa
+
+## Development
+
+Run minio server locally, with non-persisted data:
+
+ docker run -p 9000:9000 minio/minio server /data
+
+Credentials are `minioadmin:minioadmin`. Install `mc` client utility, and
+configure:
+
+ mc config host add localhost http://localhost:9000 minioadmin minioadmin
+
+Then create dev bucket:
+
+ mc mb --ignore-existing localhost/sandcrawler-dev
+
+A common "gotcha" with `mc` command is that it will first look for a local
+folder/directory with same name as the configured remote host, so make sure
+there isn't a `./localhost` folder.
+
+
+## Users
+
+Create a new readonly user like:
+
+ mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly
+
+Make a prefix within a bucket world-readable like:
+
+ mc policy set download cluster/unpaywall/grobid
+
+## Config
+
+ mc admin config set aitio compression extensions=.txt,.log,.csv,.json,.tsv,.pdf,.xml mime_types=text/csv,text/plain,application/json,application/xml,application/octet-stream,application/tei+xml
diff --git a/blobs/minio/minio.conf b/blobs/minio/minio.conf
new file mode 100644
index 0000000..2e93f9a
--- /dev/null
+++ b/blobs/minio/minio.conf
@@ -0,0 +1,14 @@
+
+# Volume to be used for MinIO server.
+MINIO_VOLUMES="/sandcrawler-minio/data"
+# Use if you want to run MinIO on a custom port.
+MINIO_OPTS="--address :9000"
+# Access Key of the server.
+MINIO_ACCESS_KEY=REDACTED
+# Secret key of the server.
+MINIO_SECRET_KEY=REDACTED
+
+# may need to set these manually using `mc admin config get`, edit the JSON, then `set`
+MINIO_COMPRESS="on"
+MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv"
+MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream"
diff --git a/blobs/seaweedfs/README.md b/blobs/seaweedfs/README.md
new file mode 100644
index 0000000..d19e9e0
--- /dev/null
+++ b/blobs/seaweedfs/README.md
@@ -0,0 +1,9 @@
+
+## HOWTO: Create new bucket in SeaweedFS
+
+Log in to the seaweedfs VM.
+
+Run `weed shell` to start a shell, then:
+
+ bucket.create -name <bucket>
+
diff --git a/blobs/tasks.md b/blobs/tasks.md
new file mode 100644
index 0000000..34dec8f
--- /dev/null
+++ b/blobs/tasks.md
@@ -0,0 +1,53 @@
+
+## Backfill GROBID XML to Blob Store
+
+Initially ran this when spinning up new seaweedfs server to replace minio. At
+this time grobid persist worker was in db-only mode, as minio was too slow to
+accept uploads. Rough plan is to:
+
+1. run grobid persist worker from Kafka with a new temporary consumer group,
+ from the start of the GROBID output topic
+2. when it gets to end, stop the *regular* consumer group while this one is
+ still running. with temporary worker still running, at that point in time
+ entire topic should be in S3
+3. then reconfigure regular worker to db+s3 mode. halt the temporary worker,
+ restart the regular one with new config, run it indefinitely
+
+Consumer group isn't an arg, so just edit `persist_worker.py` and set it to
+`persist-grobid-seaweed`. Also needed to patch a bit so `--s3-only` mode
+didn't try to connect to postgresql.
+
+Commands:
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
+ => run briefly, then kill
+
+On kafka-broker worker:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run
+
+Then run 2x instances of worker (same command as above):
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+
+At this point CPU-limited on this worker by the python processes (only 4 cores
+on this machine).
+
+Check in weed shell:
+
+ weed shell
+
+ > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml
+ [...]
+ "isGzipped": false
+ [...]
+ "mime": "application/xml",
+ [...]
+
+An open question is if we should have separate buckets per derive type. Eg, a
+GROBID XML bucket separate from thumbnails bucket. Or are prefix directories
+enough. Basically this comes down to whether we want things mixed together at
+the volume level. I think we should keep separate.
+
+Need to set the mimetype in the upload for gzip on XML?
diff --git a/cdx-record-pipeline/README.md b/cdx-record-pipeline/README.md
deleted file mode 100644
index 797b8eb..0000000
--- a/cdx-record-pipeline/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-CDX Record Pipeline (GrobId Edition)
-=====================================
-
-Hadoop based pipeline to process PDFs from a specified IA CDX dataset
-
-## Local mode example ##
-
-```
-cat -n /home/bnewbold/100k_random_gwb_pdf.cdx | ./cdx-record-pipeline.py
-
-```
-
-## Cluster mode example ##
-
-```
-input=100k_random_gwb_pdf.cdx
-output=100k_random_gwb_pdf.out
-lines_per_map=1000
-
-hadoop jar /home/webcrawl/hadoop-2/hadoop-mapreduce/hadoop-streaming.jar
- -archives "hdfs://ia802400.us.archive.org:6000/lib/cdx-record-pipeline-venv.zip#cdx-record-pipeline-venv"
- -D mapred.reduce.tasks=0
- -D mapred.job.name=Cdx-Record-Pipeline
- -D mapreduce.job.queuename=extraction
- -D mapred.line.input.format.linespermap=${lines_per_map}
- -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat
- -input ${input}
- -output ${output}
- -mapper cdx-record-pipeline.py
- -file cdx-record-pipeline.py
-
-```
-
diff --git a/cdx-record-pipeline/cdx-record-pipeline.py b/cdx-record-pipeline/cdx-record-pipeline.py
deleted file mode 100755
index 9e521bf..0000000
--- a/cdx-record-pipeline/cdx-record-pipeline.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!./cdx-record-pipeline-venv/bin/python
-'''
-GrobId PDF Pipeline Test
-Read in CDX lines and query GROBID server for each PDF resource
-TODO: Testing / HBase integration -- Bryan will update as needed
-'''
-import os
-import re
-import sys
-import base64
-import hashlib
-import urllib
-import urlparse
-import re
-import string
-from wayback.resource import Resource
-from wayback.resource import ArcResource
-from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
-from StringIO import StringIO
-import requests
-import sys
-
-def process_pdf_using_grobid(content_buffer, debug_line):
- """Query GrobId server & process response
- """
- GROBID_SERVER="http://wbgrp-svc096.us.archive.org:8070"
- content = content_buffer.read()
- r = requests.post(GROBID_SERVER + "/api/processFulltextDocument",
- files={'input': content})
- if r.status_code is not 200:
- print("FAIL (Grobid: {}): {}".format(r.content.decode('utf8'), debug_line))
- else:
- print("SUCCESS: " + debug_line)
-
-class Cdx_Record_Pipeline(object):
-
- def read_cdx_and_parse(self, parser_func, accepted_mimes = []):
- """Read in CDX lines and process PDF records fetched over HTTP
- """
- rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
- for line in sys.stdin:
- line = line.rstrip()
- cdx_line = line.split()
- #ignoring NLine offset
- if len(cdx_line) != 12:
- continue
- cdx_line = cdx_line[1:]
- (src_url, timestamp, mime, record_location, record_offset, record_length) = (cdx_line[2], cdx_line[1], cdx_line[3], cdx_line[-1], cdx_line[-2], cdx_line[-3])
- if '-' == record_length or not record_location.endswith('arc.gz') or mime not in accepted_mimes:
- continue
- orig_url = cdx_line[2]
- debug_line = ' '.join(cdx_line)
- try:
- record_location = 'http://archive.org/download/' + record_location
- record_offset = int(record_offset)
- record_length = int(record_length)
- resource_data = rstore.load_resource(record_location, record_offset, record_length)
- parser_func(resource_data.open_raw_content(), debug_line)
- except:
- continue
-
-# main()
-#_______________________________________________________________________________
-if __name__ == '__main__':
- cdx_record_pipeline = Cdx_Record_Pipeline()
- cdx_record_pipeline.read_cdx_and_parse(process_pdf_using_grobid, ['application/pdf', 'application/x-pdf'])
diff --git a/cdx-record-pipeline/requirements.txt b/cdx-record-pipeline/requirements.txt
deleted file mode 100644
index 17b803f..0000000
--- a/cdx-record-pipeline/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---extra-index-url https://devpi.archive.org/wb/prod/
-wayback==0.2.1.2
-GlobalWayback==0.3
diff --git a/extra/docker/README.md b/extra/docker/README.md
new file mode 100644
index 0000000..23cb5b2
--- /dev/null
+++ b/extra/docker/README.md
@@ -0,0 +1,11 @@
+
+The docker-compose script in this directory may be helpful for local
+development. It starts several dependent services, such as Kafka, minio, etc.
+
+PostgreSQL is assumed to be running natively on localhost, not under docker. It
+should be possible to add postgresql to the docker-compose file, but some
+developers (bnewbold) prefer to run it separately to make things like attaching
+with `psql` easier.
+
+There is no current motivation or plan to deploy sandcrawler services using
+docker, so there is no Dockerfile for the system itself.
diff --git a/extra/docker/docker-compose.yml b/extra/docker/docker-compose.yml
new file mode 100644
index 0000000..196879f
--- /dev/null
+++ b/extra/docker/docker-compose.yml
@@ -0,0 +1,39 @@
+version: '2'
+services:
+ zookeeper:
+ image: wurstmeister/zookeeper
+ ports:
+ - "2181:2181"
+ kafka:
+ image: wurstmeister/kafka:2.11-2.0.0
+ ports:
+ - "9092:9092"
+ environment:
+ #HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2"
+ KAFKA_BROKER_ID: 1
+ KAFKA_ADVERTISED_HOST_NAME: 127.0.0.1
+ KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
+ KAFKA_CREATE_TOPICS: "fatcat-dev.changelog:1:1,fatcat-dev.release-updates:3:1:compact"
+ KAFKA_MESSAGE_MAX_BYTES: 50000000
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock
+ depends_on:
+ - zookeeper
+ postgrest:
+ image: postgrest/postgrest
+ network_mode: "host"
+ ports:
+ - "3000:3000"
+ environment:
+ PGRST_DB_URI: "postgres://fatcat:tactaf@localhost/sandcrawler"
+ PGRST_DB_ANON_ROLE: "fatcat"
+ minio:
+ image: minio/minio
+ ports:
+ - "9000:9000"
+ environment:
+ MINIO_ACCESS_KEY: minioadmin
+ MINIO_SECRET_KEY: minioadmin
+ entrypoint: sh
+ command: -c "mkdir -p /tmp/minio/sandcrawler && mkdir -p /tmp/minio/thumbnail && mkdir -p /tmp/minio/sandcrawler-dev && /usr/bin/minio server /tmp/minio"
+
diff --git a/fetch_hadoop.sh b/fetch_hadoop.sh
new file mode 100755
index 0000000..633f8fa
--- /dev/null
+++ b/fetch_hadoop.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# This script was originally only for pig scripts; now it can also be used to
+# run scalding code locally (via please)
+
+set -euo pipefail
+
+#PIG_VERSION="0.12.0-cdh5.2.0"
+# Using more recent version to work around snappy classpath problem
+PIG_VERSION="0.17.0"
+HADOOP_VERSION="2.3.0-cdh5.0.1"
+
+mkdir -p pig/deps/
+cd pig/deps/
+
+# Fetch Hadoop Command
+echo https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz
+wget -c https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz
+echo "Extracting Hadoop (takes a minute)..."
+tar xvf hadoop-${HADOOP_VERSION}.tar.gz > /dev/null
+ln -fs hadoop-${HADOOP_VERSION} hadoop
+
+# Fetch Pig
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
+#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
+wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz
+echo "Extracting Pig (takes a minute)..."
+tar xvf pig-${PIG_VERSION}.tar.gz > /dev/null
+ln -fs pig-${PIG_VERSION} pig
+
+# No 'readlink -f' on macOS
+# https://stackoverflow.com/a/24572274/4682349
+JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::")
+./pig/bin/pig -x local -version
+./hadoop/bin/hadoop version
+
diff --git a/hbase/howto.md b/hbase/howto.md
new file mode 100644
index 0000000..26d33f4
--- /dev/null
+++ b/hbase/howto.md
@@ -0,0 +1,42 @@
+
+Commands can be run from any cluster machine with hadoop environment config
+set up. Most of these commands are run from the shell (start with `hbase
+shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be
+QA/prod tables, but there are not QA/prod clusters.
+
+## Create Table
+
+Create column families (note: not all individual columns) with something like:
+
+ create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'}
+
+## Run Thrift Server Informally
+
+The Thrift server can technically be run from any old cluster machine that has
+Hadoop client stuff set up, using:
+
+ hbase thrift start -nonblocking -c
+
+Note that this will run version 0.96, while the actual HBase service seems to
+be running 0.98.
+
+To interact with this config, use happybase (python) config:
+
+ conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact")
+ # Test connection
+ conn.tables()
+
+## Queries From Shell
+
+Fetch all columns for a single row:
+
+ hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'
+
+Fetch multiple columns for a single row, using column families:
+
+ hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file'
+
+Scan a fixed number of rows (here 5) starting at a specific key prefix, all
+columns:
+
+ hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'}
diff --git a/hbase/notes.txt b/hbase/notes.txt
new file mode 100644
index 0000000..20f406f
--- /dev/null
+++ b/hbase/notes.txt
@@ -0,0 +1,232 @@
+
+## Notes on HBase features
+
+Decent one-page introduction:
+https://www.tutorialspoint.com/hbase/hbase_overview.htm
+
+Question: what version of hbase are we running? what on-disk format?
+
+=> Server: HBase 0.98.6-cdh5.3.1
+=> Client: HBase 0.96.1.1-cdh5.0.1
+=> As of 2018, 1.2 is stable and 2.0 is released.
+
+Question: what are our servers? how can we monitor?
+
+=> http://ia802402.us.archive.org:6410/master-status
+
+I haven't been able to find a simple table of hbase version and supported/new
+features over the years (release notes are too detailed).
+
+Normal/online mapreduce over tables sounds like it goes through a "region
+server" and is slow. Using snapshots allows direct access to underlying
+tablets on disk? Or is access always direct?
+
+Could consider "Medium-sized Object" support for 100 KByte to 10 MByte sized
+files. This seems to depend on HFile v3, which was added in HBase 0.98, so we
+can't use it yet.
+
+Do we need to decide on on-disk format? Just stick with defaults.
+
+Looks like we use the `happybase` python package to write. This is packaged in
+debian, but only for python2. There is also a `starbase` python library
+wrapping the REST API.
+
+There is a "bulk load" mechanism for going directly from HDFS into HBase, by
+creating HFiles that can immediately be used by HBase.
+
+## Specific "Queries" needed
+
+"Identifier" will mostly want to get "new" (unprocessed) rows to process. It
+can do so by
+
+Question: if our columns are mostly "dense" within a column group (aka, mostly
+all or none set), what is the value of splitting out columns instead of using a
+single JSON blob or something like that? Not needing to store the key strings?
+Being able to do scan filters? The latter obviously makes sense in some
+contexts.
+
+- is there a visible distinction between "get(table, group:col)" being
+ zero-length (somebody put() an empty string, like "") versus that column not
+ having been written to?
+
+## Conversation with Noah about Heritrix De-Dupe
+
+AIT still uses HBase for url-agnostic de-dupe, but may move away from it. Does
+about 250 reads/sec (estimate based on URL hits per quarter). Used to have more
+problems (region servers?) but haven't for a while. If a crawler can't reach
+HBase, it will "fail safe" and retain the URL. However, Heritrix has trouble
+starting up if it can't connect at start. Uses the native JVM drivers.
+
+Key is "sha1:<base32>-<crawlid>", so in theory they can control whether to
+dedupe inside or outside of individual crawls (or are they account IDs?). IIRC
+all columns were in one family, and had short names (single character). Eg:
+
+ hbase(main):012:0> scan 'ait-prod-digest-history',{LIMIT=>5,STARTROW=>'sha1:A'}
+ sha1:A22222453XRJ63AC7YCSK46APWHTJKFY-2312 column=f:c, timestamp=1404151869546, value={"c":1,"u":"http://www.theroot.com/category/views-tags/black-fortune-500-ceos","d":"2012-02-23T08:27:10Z","o":8867552,"f":"ARCHIVEIT-REDACTED-20120223080317-00009-crawling201.us.archive.org-6681.warc.gz"}
+
+Code for url-agnostic dedupe is in:
+
+ heritrix3/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBaseContentDigestHistory.java
+
+Crawl config snippet:
+
+ [...]
+ <bean id="iaWbgrpHbase" class="org.archive.modules.recrawl.hbase.HBase">
+ <property name="properties">
+ <map>
+ <entry key="hbase.zookeeper.quorum" value="mtrcs-zk1,mtrcs-zk2,mtrcs-zk3,mtrcs-zk4,mtrcs-zk5"/>
+ <entry key="hbase.zookeeper.property.clientPort" value="2181"/>
+ <entry key="hbase.client.retries.number" value="2"/>
+ </map>
+ </property>
+ </bean>
+ <bean id="hbaseDigestHistoryTable" class="org.archive.modules.recrawl.hbase.HBaseTable">
+ <property name="name" value="ait-prod-digest-history"/>
+ <property name="create" value="true"/>
+ <property name="hbase">
+ <ref bean="iaWbgrpHbase"/>
+ </property>
+ </bean>
+ <bean id="hbaseDigestHistory" class="org.archive.modules.recrawl.hbase.HBaseContentDigestHistory">
+ <property name="addColumnFamily" value="true"/>
+ <property name="table">
+ <ref bean="hbaseDigestHistoryTable"/>
+ </property>
+ <property name="keySuffix" value="-1858"/>
+ </bean>
+ <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
+ <property name="processors">
+ <list>
+ <bean class="org.archive.modules.recrawl.ContentDigestHistoryLoader">
+ <property name="contentDigestHistory">
+ <ref bean="hbaseDigestHistory"/>
+ </property>
+ </bean>
+ <ref bean="warcWriter"/>
+ <bean class="org.archive.modules.recrawl.ContentDigestHistoryStorer"/>
+ [...]
+
+## Kenji Conversation (global wayback)
+
+Spoke with Kenji, who had previous experience trying to use HBase for crawl
+de-dupe. Take away was that it didn't perform well for them even back then,
+with 3000+ req/sec. AIT today is more like 250 req/sec.
+
+Apparently CDX API is just the fastest thing ever; stable slow latency on reads
+(~200ms!), and it takes an hour for "writes" (bulk deltacdx or whatever).
+
+Sounds like HBase in particular struggled with concurrent heavy reads and
+writes; frequent re-compaction caused large tail latencies, and when region
+servers were loaded they would time-out of zookeeper.
+
+He pushed to use elasticsearch instead of hbase to store extracted fulltext, as
+a persistent store, particularly if we end up using it for fulltext someday. He
+thinks it operates really well as a datastore. I am not really comfortable with
+this usecase, or depending on elastic as a persistent store in general, and it
+doesn't work for the crawl dedupe case.
+
+He didn't seem beholden to the tiny column name convention.
+
+
+## Google BigTable paper (2006)
+
+Hadn't read this paper in a long time, and didn't really understand it at the
+time. HBase is a clone of BigTable.
+
+They used bigtable to store crawled HTML! Very similar use-case to our journal
+stuff. Retained multiple crawls using versions; version timestamps are crawl
+timestamps, aha.
+
+Crazy to me how the whole hadoop world is Java (garbage collected), while all
+the google stuff is C++. So many issues with hadoop are performance/latency
+sensitive; having garbage collection in a situation when RAM is tight and
+network timeouts are problematic seems like a bad combination for operability
+(median/optimistic performance is probably fine)
+
+"Locality" metadata important for actually separating column families. Column
+name scarcity doesn't seem to be a thing/concern. Compression settings
+important. Key selection to allow local compression seems important to them.
+
+Performance probably depends a lot on 1) relative rate of growth (slow if
+re-compressing, etc), 2)
+
+Going to want/need table-level monitoring, probably right from the start.
+
+## Querying/Aggregating/Stats
+
+We'll probably want to be able to run simple pig-style queries over HBase. How
+will that work? A couple options:
+
+- query hbase using pig via HBaseStorage and HBaseLoader
+- hive runs on map/reduce like pig
+- drill is an online/fast SQL query engine with HBase back-end support. Not
+ map/reduce based; can run from a single server. Supports multiple "backends".
+ Somewhat more like pig; "schema-free"/JSON.
+- impala supports HBase backends
+- phoenix is a SQL engine on top of HBase
+
+## Hive Integration
+
+Run `hive` from a hadoop cluster machine:
+
+ bnewbold@ia802405$ hive --version
+ Hive 0.13.1-cdh5.3.1
+ Subversion file:///var/lib/jenkins/workspace/generic-package-ubuntu64-12-04/CDH5.3.1-Packaging-Hive-2015-01-27_16-23-36/hive-0.13.1+cdh5.3.1+308-1.cdh5.3.1.p0.17~precise -r Unknown
+ Compiled by jenkins on Tue Jan 27 16:38:11 PST 2015
+ From source with checksum 1bb86e4899928ce29cbcaec8cf43c9b6
+
+Need to create mapping tables:
+
+ CREATE EXTERNAL TABLE journal_extract_qa(rowkey STRING, grobid_status STRING, file_size STRING)
+ STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+ WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,grobid0:status_code,file:size')
+ TBLPROPERTIES ('hbase.table.name' = 'wbgrp-journal-extract-0-qa');
+
+Maybe:
+
+ SET hive.aux.jars.path = file:///home/webcrawl/hadoop-2/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///home/webcrawl/hadoop-2/hive/lib/hbase-client-0.96.1.1-cdh5.0.1.jar;
+ SELECT * from journal_extract_qa LIMIT 10;
+
+Or?
+
+ ADD jar /usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar;
+ ADD jar /usr/lib/hive/lib/hive-shims-common-secure-0.13.1-cdh5.3.1.jar;
+ ADD jar /usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar;
+ ADD jar /usr/lib/hbase/hbase-client-0.98.6-cdh5.3.1.jar;
+ ADD jar /usr/lib/hbase/hbase-common-0.98.6-cdh5.3.1.jar;
+
+Or, from a real node?
+
+ SET hive.aux.jars.path = file:///usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar,file:///usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar;
+ SELECT * from journal_extract_qa LIMIT 10;
+
+Getting an error:
+
+ Exception in thread "main" java.lang.NoSuchMethodError: org.apache.hadoop.hdfs.client.HdfsAdmin.getEncryptionZoneForPath(Lorg/apache/hadoop/fs/Path;)Lorg/apache/hadoop/hdfs/protocol/EncryptionZone;
+
+The HdfsAdmin admin class is in hadoop-hdfs, but `getEncryptionZoneForPath`
+isn't in there. See upstream commit:
+
+ https://github.com/apache/hadoop/commit/20dcb841ce55b0d414885ceba530c30b5b528b0f
+
+## Debugging
+
+List hbase tables known to zookeeper (as opposed to `list` from `hbase shell`):
+
+ hbase zkcli ls /hbase/table
+
+Look for jar files with a given symbol:
+
+ rg HdfsAdmin -a /usr/lib/*/*.jar
+
+## Performance
+
+Should pre-allocate regions for tables that are going to be non-trivially
+sized, otherwise all load hits a single node. From the shell, this seems to
+involve specifying the split points (key prefixes) manually. From the docs:
+
+ http://hbase.apache.org/book.html#precreate.regions
+
+There is an ImportTsv tool which might have been useful for original CDX
+backfill, but :shrug:. It is nice to have only a single pipeline and have it
+work well.
diff --git a/hbase/schema_design.md b/hbase/schema_design.md
new file mode 100644
index 0000000..2db8998
--- /dev/null
+++ b/hbase/schema_design.md
@@ -0,0 +1,79 @@
+
+## PDF Table
+
+Table name: `wbgrp-journal-extract-<version>-<env>`
+
+Eg: `wbgrp-journal-extract-0-prod`
+
+Key is the sha1 of the file, as raw bytes (20 bytes).
+
+Could conceivably need to handle, eg, postscript files, JATS XML, or even HTML
+in the future? If possible be filetype-agnostic, but only "fulltext" file types
+will end up in here, and don't bend over backwards.
+
+Keep only a single version (do we need `VERSIONS => 1`, or is 1 the default?)
+
+IMPORTANT: column names should be unique across column families. Eg, should not
+have both `grobid0:status` and `match0:status`. HBase and some client libraries
+don't care, but some map/reduce frameworks (eg, Scalding) can have name
+collisions. Differences between "orthogonal" columns *might* be OK (eg,
+`grobid0:status` and `grobid1:status`).
+
+Column families:
+
+- `key`: sha1 of the file in base32 (not a column or column family)
+- `f`: heritrix HBaseContentDigestHistory de-dupe
+ - `c`: (json string)
+ - `u`: original URL (required)
+ - `d`: original date (required; ISO 8601:1988)
+ - `f`: warc filename (recommended)
+ - `o`: warc offset (recommended)
+ - `c`: dupe count (optional)
+ - `i`: warc record ID (optional)
+- `file`: crawl and file metadata
+ - `size` (uint64), uncompressed (not in CDX)
+ - `mime` (string; might do postscript in the future; normalized)
+ - `cdx` (json string) with all as strings
+ - `surt`
+ - `url`
+ - `dt`
+ - `warc` (item and file name)
+ - `offset`
+ - `c_size` (compressed size)
+ - `meta` (json string)
+ - `size` (int)
+ - `mime` (str)
+ - `magic` (str)
+ - `magic_mime` (str)
+ - `sha1` (hex str)
+ - `md5` (hex str)
+ - `sha256` (hex str)
+- `grobid0`: processing status, version, XML and JSON fulltext, JSON metadata. timestamp. Should be compressed! `COMPRESSION => SNAPPY`
+ - `status_code` (signed int; HTTP status from grobid)
+ - `quality` (int or string; we define the meaning ("good"/"marginal"/"bad"))
+ - `status` (json string from grobid)
+ - `tei_xml` (xml string from grobid)
+ - `tei_json` (json string with fulltext)
+ - `metadata` (json string with author, title, abstract, citations, etc)
+- `match0`: status of identification against "the catalog"
+ - `mstatus` (string; did it match?)
+ - `doi` (string)
+ - `minfo` (json string)
+
+Can add additional groups in the future for additional processing steps. For
+example, we might want to do first pass looking at files to see "is this a PDF
+or not", which would output some status (and maybe certainty).
+
+The Heritrix schema is fixed by the existing implementation. We could
+patch/extend heritrix to use the `file` schema in the future if we decide
+it's worth it. There are some important pieces of metadata missing, so at
+least to start I think we should keep `f` and `file` distinct. We could merge
+them later. `f` info will be populated by crawlers; `file` info should be
+populated when back-filling or processing CDX lines.
+
+If we wanted to support multiple CDX rows as part of the same row (eg, as
+alternate locations), we could use HBase's versions feature, which can
+automatically cap the number of versions stored.
+
+If we had enough RAM resources, we could store `f` (and maybe `file`) metadata
+in memory for faster access.
diff --git a/kafka/debugging_issues.txt b/kafka/debugging_issues.txt
new file mode 100644
index 0000000..007c786
--- /dev/null
+++ b/kafka/debugging_issues.txt
@@ -0,0 +1,48 @@
+
+## 2020-11-12
+
+To reset a consumer group to the offsets from a specific date (or datetime),
+use:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000
+
+Add `--execute` to actually commit the change.
+
+## 2018-12-02
+
+Had been having some troubles with consumer group partition assignments with
+the grobid-output group and grobid-hbase-insert consumer group. Tried deleting
+and re-creating, which was probably a mistake. Also tried to use kafka-broker
+shell scripts to cleanup/debug and didn't work well.
+
+In the end, after re-building the topic, decided to create a new consumer group
+(grobid-hbase-insert2) to get rid of history/crap. Might need to do this again
+in the future, oh well.
+
+A few things learned:
+
+- whatever pykafka "native python" is producing to consumer group offsets
+ doesn't work great with kafka-manager or the shell scripts: consumer instance
+ names don't show. this is an error in shell scripts, and blank/red in
+ kafka-manager
+- restarting kafka-manager takes a while (for it to refresh data?) and it shows
+ inconsistent stuff during that period, but it does result in cleaned up
+ consumer group cached metadata (aka, old groups are cleared)
+- kafka-manager can't fetch JMX info, either due to lack of config or port
+ blocking. should try to fix this for metrics etc
+- it would be nice to be using recent librdkafka everywhere. pykafka can
+ optionally use this, and many other tools do automatically. however, this is
+ a system package, and xenial doesn't have backports (debian stretch does).
+ the version in bionic looks "good enough", so maybe should try that?
+- there has been a minor release of kafka (2.1) since I installed (!)
+- the burrow (consumer group monitoring) tool is packaged for some version of
+ ubuntu
+
+In general, not feeling great about the current setup. Very frustrating that the
+debug/status tools are broken with pykafka native output. Need to at least
+document things a lot better.
+
+Separately, came up with an idea to do batched processing with GROBID: don't
+auto-commit, instead consume a batch (10? or until block), process those, then
+commit. This being a way to get "the batch size returned".
+
diff --git a/kafka/grobid_kafka_notes.txt b/kafka/grobid_kafka_notes.txt
new file mode 100644
index 0000000..a1f7380
--- /dev/null
+++ b/kafka/grobid_kafka_notes.txt
@@ -0,0 +1,60 @@
+
+Will want to be able to scale to 100-200+ fully-utilized cores running GROBID;
+how best to achieve this? will need *many* workers doing concurrent HTTP GETs,
+POSTs, and Kafka publishes.
+
+I'm pretty confident we can relax "at least once"/"at most once" constraints in
+this case: infrequent re-processing and missing a tiny fraction of processed
+works should be acceptable, because we will have higher-level checks (eg, the
+'ungrobided' HBase filter/dump).
+
+For the 'ungrobided' topic, use a reasonably large number of partitions, say
+50. This sets max number of worker *processes*, and may be enough for initial
+single-host worker. We can have a python wrapper spawn many child processes
+using multiprocessing library, with completely independent kafka client
+connections in each.
+
+To get more concurrency, each consumer *process* creates a thread pool (or
+process pool?), and a Queue with fixed size. Consumes messages, pushes to
+Queue, worker threads pull and do the rest. golang sure would be nice for
+this...
+
+Need to ensure we have compression enabled, for the GROBID output in
+particular! Probably worth using "expensive" GZIP compression to get extra disk
+savings; latency shouldn't be a big deal here.
+
+## Commands
+
+Load up some example lines, without partition key:
+
+ head -n10 python/tests/files/example_ungrobided.tsv | kafkacat -P -b localhost:9092 -t sandcrawler-qa.ungrobided
+
+Load up some example lines, with partition key:
+
+ head -n10 python/tests/files/example_ungrobided.tsv | awk -F'\t' '{print $1 "\t" $0}' | kafkacat -K$'\t' -P -b localhost:9092 -t sandcrawler-qa.ungrobided
+
+Check ungrobided topic:
+
+ kafkacat -C -b localhost:9092 -t sandcrawler-qa.ungrobided
+
+Check grobid output:
+
+ kafkacat -C -b localhost:9092 -t sandcrawler-qa.grobid-output
+
+## Actual Production Commands
+
+ gohdfs get sandcrawler/output-prod/2018-11-30-2125.55-dumpungrobided/part-00000
+ mv part-00000 2018-11-30-2125.55-dumpungrobided.tsv
+ cat 2018-11-30-2125.55-dumpungrobided.tsv | kafkacat -P -b 127.0.0.1:9092 -t sandcrawler-prod.ungrobided
+
+## Performance
+
+On 2018-11-21, using grobid-vm (svc096) with 30 cores, and running with 50x
+kafka-grobid-worker processes (using systemd parallelization), achieved:
+
+- 2044 PDFs extracted in 197 seconds, or about 10/second
+- that's about 28 hours to process 1 million PDFs
+
+I think this is about all the single machine can handle. To get more throughput
+with multiple machines, might need to tweak worker to use a worker thread-pool
+or some other concurrent pattern (async?).
diff --git a/kafka/howto_rebalance.md b/kafka/howto_rebalance.md
new file mode 100644
index 0000000..093740a
--- /dev/null
+++ b/kafka/howto_rebalance.md
@@ -0,0 +1,43 @@
+
+## Rebalance Storage Between Brokers (kafka-manager web)
+
+For each topic you want to rebalance (eg, the large or high-throughput ones),
+go to the topic page and click the blue "reassign partitions" button (or
+potentially "generate" or "manual").
+
+Monitor progress with the "Reassign Partitions" link at top of the page.
+
+Finally, run a preferred replica election after partition movement is complete.
+
+## Rebalance Storage Between Brokers (CLI)
+
+For example, after adding or removing brokers from the cluster.
+
+Create a list of topics to move, and put it in `/tmp/topics_to_move.json`:
+
+ {
+ "version": 1,
+ "topics": [
+ {"topic": "sandcrawler-shadow.grobid-output"},
+ {"topic": "fatcat-prod.api-crossref"}
+ ]
+ }
+
+On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then
+inspect the output:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | jq .
+
+If that looks good, start the rebalance:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute
+
+Then monitor progress:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify
+
+Finally, run a preferred replica election after partition movement is complete.
+Currently do this through the web interface (linked above).
diff --git a/kafka/topics.md b/kafka/topics.md
new file mode 100644
index 0000000..a699e16
--- /dev/null
+++ b/kafka/topics.md
@@ -0,0 +1,214 @@
+
+This file lists all the Kafka topics currently used by sandcrawler (and
+fatcat).
+
+NOTE: should use `.` or `_` in topic names, but not both. We chose to use `.`
+
+ENV below is one of `prod` or `qa`.
+
+
+## Topic List
+
+All topics should default to `snappy` compression on-disk, and indefinite
+retention (on both a size and time basis).
+
+ sandcrawler-ENV.grobid-output-pg
+ => output of GROBID processing using grobid_tool.py
+ => schema is sandcrawler-db style JSON: TEI-XML as a field
+ => expected to be large; 12 partitions
+ => use GZIP compression (worth the overhead)
+ => key is sha1hex of PDF; enable key compaction
+
+ sandcrawler-ENV.ungrobided-pg
+ => PDF files in IA needing GROBID processing
+ => schema is sandcrawler-db style JSON. Can be either `cdx` or `petabox` object
+ => fewer partitions with batch mode, but still a bunch (24?)
+ => key is sha1hex of PDF. enable time compaction (6 months?)
+
+ sandcrawler-ENV.ingest-file-requests-daily
+ => was ingest-file-requests previously, but renamed/rebalanced
+ => ingest requests from multiple sources; mostly continuous or pseudo-interactive
+ => schema is JSON; see ingest proposal for fields. small objects.
+ => fewer partitions with batch mode, but still a bunch (24)
+ => can't think of a good key, so none. enable time compaction (3-6 months?)
+
+ sandcrawler-ENV.ingest-file-requests-bulk
+ => ingest requests from bulk crawl sources; background processing
+ => same as ingest-file-requests
+
+ sandcrawler-ENV.ingest-file-requests-priority
+ => ingest requests from bulk crawl sources; background processing
+ => same as ingest-file-requests
+
+ sandcrawler-ENV.ingest-file-results
+ => results of ingest requests from multiple sources
+ => schema is JSON; see ingest proposal for fields. small objects.
+ => 6 partitions
+ => can't think of a good key, so none; no compaction
+
+ sandcrawler-ENV.pdftrio-output
+ => output of each pdftrio ML classification
+ => schema is JSON; see pdftrio proposal for fields. small objects.
+ => 6 partitions
+ => key is sha1hex of PDF; enable key compaction
+
+ sandcrawler-ENV.unextracted
+ => PDF files in IA needing extraction (thumbnails and text)
+ => schema is sandcrawler-db style JSON. Can be either `cdx` or `petabox` object
+ => fewer partitions with batch mode, but still a bunch (12? 24?)
+ => key is sha1hex of PDF. enable time compaction (6 months?)
+
+ sandcrawler-ENV.pdf-text
+ => fulltext (raw text) and PDF metadata for pdfs
+ => schema is JSON; see pdf_meta proposal for fields. large objects.
+ => 12 partitions
+ => key is sha1hex of PDF; enable key compaction; gzip compression
+
+ sandcrawler-ENV.xml-doc
+ => fulltext XML; mostly JATS XML
+ => schema is JSON, with 'jats_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of XML document; enable key compaction; gzip compression
+
+ sandcrawler-ENV.html-teixml
+ => extracted fulltext from HTML; mostly TEI-XML
+ => schema is JSON, with 'tei_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of source HTML document; enable key compaction; gzip compression
+
+ sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE
+ => thumbnail images (eg, png, jpg) from PDFs
+ => raw bytes in message (no JSON or other wrapping). fields average 10 KByte
+ => 12 partitions; expect a TByte or so total
+ => key is sha1hex of PDF; enable key compaction; no compression
+
+ fatcat-ENV.api-crossref
+ fatcat-ENV.api-datacite
+ => all new and updated DOIs (regardless of type)
+ => full raw crossref/datacite API objects (JSON)
+ => key: lower-case DOI
+ => ~1TB capacity; 8x crossref partitions, 4x datacite
+ => key compaction possible
+
+ fatcat-ENV.ftp-pubmed
+ => new citations from FTP server, from: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
+ => raw XML, one record per message (PubmedArticle, up to 25k records/day and 650MB/day)
+ => key: PMID
+ => key compaction possible
+
+ fatcat-ENV.api-crossref-state
+ fatcat-ENV.api-datacite-state
+ fatcat-ENV.ftp-pubmed-state
+ fatcat-ENV.oaipmh-pubmed-state
+ fatcat-ENV.oaipmh-arxiv-state
+ fatcat-ENV.oaipmh-doaj-journals-state (DISABLED)
+ fatcat-ENV.oaipmh-doaj-articles-state (DISABLED)
+ => serialized harvester state for ingesters
+ => custom JSON
+ => key: timespan? nothing to start
+ => 1x partitions; time/space limit Ok
+
+ fatcat-ENV.changelog
+ => small-ish objects (not fully expanded/hydrated)
+ => single partition
+ => key: could be changelog index (integer, as string)
+
+ fatcat-ENV.release-updates-v03
+ => contains "fully" expanded JSON objects
+ => v03 is newer v0.3.0 API schema (backwards incompatible)
+ => key: fcid
+ => 8x partitions
+ fatcat-ENV.container-updates
+ => key: fcid
+ => 4x partitions
+ fatcat-ENV.file-updates
+ => key: fcid
+ => 4x partitions
+ fatcat-ENV.work-ident-updates
+ => work identifiers when updated and needs re-indexing (eg, in scholar)
+ => 6x partitions
+ => key: doc ident ("work_{ident}")
+ => key compaction possible; long retention
+
+ scholar-ENV.sim-updates
+ => 6x partitions
+ => key: "sim_item_{}"
+ => key compaction possible; long retention
+ scholar-ENV.update-docs
+ => 12x partitions
+ => key: scholar doc identifier
+ => gzip compression
+ => key compaction possible
+ => short time-based retention (2 months?)
+
+### Deprecated/Unused Topics
+
+ sandcrawler-ENV.ungrobided
+ => PDF files in IA needing GROBID processing
+ => 50x partitions (huge! for worker parallelism)
+ => key: "sha1:<base32>"
+
+ sandcrawler-ENV.grobid-output
+ => output of GROBID processing (from pdf-ungrobided feed)
+ => could get big; 16x partitions (to distribute data)
+ => use GZIP compression (worth the overhead)
+ => key: "sha1:<base32>"; could compact
+
+ fatcat-ENV.oaipmh-pubmed
+ fatcat-ENV.oaipmh-arxiv
+ fatcat-ENV.oaipmh-doaj-journals (DISABLED)
+ fatcat-ENV.oaipmh-doaj-articles (DISABLED)
+ => OAI-PMH harvester output
+ => full XML resource output (just the <record> part?)
+ => key: identifier
+ => ~1TB capacity; 4x-8x partitions
+ => key compaction possible
+
+## Create fatcat QA topics
+
+If you run these commands for an existing topic, you'll get something like
+`Error while executing topic command : Topic 'fatcat-qa.changelog' already
+exists`; this seems safe, and the settings won't be overridden.
+
+ ssh misc-vm
+ cd /srv/kafka-broker/kafka_2.12-2.0.0/bin/
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests-daily --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-requests-priority --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.ftp-pubmed --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.ftp-pubmed-state
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-pubmed
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state
+
+ # only 3 partitions in QA
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-text --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
+
diff --git a/mapreduce/.coveragerc b/mapreduce/.coveragerc
deleted file mode 100644
index 6235f57..0000000
--- a/mapreduce/.coveragerc
+++ /dev/null
@@ -1,3 +0,0 @@
-[run]
-omit = tests/*
-source = .
diff --git a/mapreduce/Pipfile.lock b/mapreduce/Pipfile.lock
deleted file mode 100644
index f21e01b..0000000
--- a/mapreduce/Pipfile.lock
+++ /dev/null
@@ -1,1142 +0,0 @@
-{
- "_meta": {
- "hash": {
- "sha256": "d28f89355ce6520af7e275ad7bbc944acff5946d3701d3336bb268f4a5e82980"
- },
- "pipfile-spec": 6,
- "requires": {
- "python_version": "3.5"
- },
- "sources": [
- {
- "name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
- "verify_ssl": true
- },
- {
- "name": "pypi",
- "url": "https://pypi.python.org/simple",
- "verify_ssl": true
- }
- ]
- },
- "default": {
- "boto3": {
- "hashes": [
- "sha256:13ad5f64a247d655a27dca83274588e9d14cba61b38d3d4fd2b011b7197d88dd",
- "sha256:a56b21efbc994580fc9cef454f0f949745c152326f939aed6609d1c47b2a0f8f"
- ],
- "version": "==1.7.4"
- },
- "botocore": {
- "hashes": [
- "sha256:5602738392ecde5c02a06a3b02de07171f440a44cdfef0aadff4b59567359607",
- "sha256:77f2869b8c27afbab78b72ce6b74c75923421f364c7a0153ac1a38858c59cd91"
- ],
- "version": "==1.10.4"
- },
- "cachetools": {
- "hashes": [
- "sha256:4319bbb78172e7bcf99423e1ecd6914b32336ccfe97d2058ffe62e641a7f3abe",
- "sha256:ede01f2d3cbd6ddc9e35e16c2b0ce011d8bb70ce0dbaf282f5b4df24b213bc5d"
- ],
- "version": "==2.0.1"
- },
- "certifi": {
- "hashes": [
- "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7",
- "sha256:9fa520c1bacfb634fa7af20a76bcbd3d5fb390481724c597da32c719a7dca4b0"
- ],
- "version": "==2018.4.16"
- },
- "chardet": {
- "hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
- ],
- "version": "==3.0.4"
- },
- "click": {
- "hashes": [
- "sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d",
- "sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b"
- ],
- "version": "==6.7"
- },
- "crawllib": {
- "hashes": [
- "sha256:e24fa61cab91e0e7f461ff6173c2395fede3d4fbab565704622f5252028bdef3"
- ],
- "version": "==0.1.4"
- },
- "dawg": {
- "hashes": [
- "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
- "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
- "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
- "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
- "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
- "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
- "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
- "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
- "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
- ],
- "version": "==0.7.8"
- },
- "dill": {
- "hashes": [
- "sha256:97fd758f5fe742d42b11ec8318ecfcff8776bccacbfcec05dfd6276f5d450f73"
- ],
- "version": "==0.2.7.1"
- },
- "docutils": {
- "hashes": [
- "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
- "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274",
- "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"
- ],
- "version": "==0.14"
- },
- "dogpile.cache": {
- "hashes": [
- "sha256:631197e78b4471bb0e93d0a86264c45736bc9ae43b4205d581dcc34fbe9b5f31"
- ],
- "version": "==0.6.5"
- },
- "elasticsearch": {
- "hashes": [
- "sha256:2e6dcdf9cb62377e0d6b82d2931ebc67b5a549081bbb531aa3be60ece33257b2",
- "sha256:8539e30c33fa0ad09617893a458056a35a654c92226390c30a07a3776bb8a94f"
- ],
- "version": "==5.5.2"
- },
- "flask": {
- "hashes": [
- "sha256:0749df235e3ff61ac108f69ac178c9770caeaccad2509cb762ce1f65570a8856",
- "sha256:49f44461237b69ecd901cc7ce66feea0319b9158743dd27a2899962ab214dac1"
- ],
- "version": "==0.12.2"
- },
- "future": {
- "hashes": [
- "sha256:e39ced1ab767b5936646cedba8bcce582398233d6a627067d4c6a454c90cfedb"
- ],
- "version": "==0.16.0"
- },
- "gapic-google-cloud-datastore-v1": {
- "hashes": [
- "sha256:d5d0e6f7b4996a8dd5f4f5ad27cd389a5d7bedd07e6e9b9f3a4a3c3ea56d9053"
- ],
- "version": "==0.15.3"
- },
- "gapic-google-cloud-error-reporting-v1beta1": {
- "hashes": [
- "sha256:39f3f8e974441fb57ea98d0be0f4cdedd18cda9a628aec31e1f0841affab6f16"
- ],
- "version": "==0.15.3"
- },
- "gapic-google-cloud-logging-v2": {
- "hashes": [
- "sha256:b0b87bf56c50e4617fb539e5c36f2e286ba87c67a9b1ef5c9ec854d6d7b10023"
- ],
- "version": "==0.91.3"
- },
- "globalwayback": {
- "hashes": [
- "sha256:74fb668e94528228d30040654fb5cc1cb7978da3982fa7e4e6bc36bafa4685be"
- ],
- "index": "ia",
- "version": "==0.3.10.9"
- },
- "google-api-core": {
- "hashes": [
- "sha256:0144d467083ed54d2e8ccb4212d42c3724fe0b844b7d3a0ff85aea54b7ae8347",
- "sha256:1698e4af4edd2a8ec22d678fd9abe99c926c3a154b2232f32807946ca49e3da1"
- ],
- "version": "==0.1.4"
- },
- "google-auth": {
- "hashes": [
- "sha256:34088434cb2a2409360b8f3cbc04195a465df1fb2aafad71ebbded77cbf08803",
- "sha256:9051802d3dae256036cca9e34633a32c0ed1427730d4ebc513dff91ec8b6dd45"
- ],
- "version": "==1.4.1"
- },
- "google-cloud": {
- "hashes": [
- "sha256:21a11ad63cc29cfd4e4b6426947bdc2cbefe080f87ad68fa432cf2ac108cb87d",
- "sha256:a99ee610e058c2af84ca1da710afe80fca022d2572c4bce78f6fdecac6b32d07"
- ],
- "version": "==0.32.0"
- },
- "google-cloud-bigquery": {
- "hashes": [
- "sha256:1c5a42332b43b58d6039fbca697a88c230487212afb6b4a6cbe0d5f3d5685785",
- "sha256:511f27e5e398f8bb4dcad914596e32fe5bcb111257f032d93956a2dcced4a00f"
- ],
- "version": "==0.28.0"
- },
- "google-cloud-bigquery-datatransfer": {
- "hashes": [
- "sha256:c0a290f9456b52c633cf966c73311038eb2d3fd5a61b40059edad966a434e4f9",
- "sha256:f5b5d0de43805fa9ebb620c58e1d27e6d32d2fc8e9a2f954ee170f7a026c8757"
- ],
- "version": "==0.1.1"
- },
- "google-cloud-bigtable": {
- "hashes": [
- "sha256:22c7148dc339be791d2bf4616c87e523507618570356148375c631de412f551b",
- "sha256:7f86d2c8d7f22670da98fd67bcdc17cab793f7400725c8148b05ad829c02b95e"
- ],
- "version": "==0.28.1"
- },
- "google-cloud-container": {
- "hashes": [
- "sha256:275de3032de5696cd976459966e911abd3a1b624c7a918a8137a0308898e4f90",
- "sha256:a89afcb1fe96bc9361c231c223c3bbe19fa3787caeb4697cd5778990e1077270"
- ],
- "version": "==0.1.1"
- },
- "google-cloud-core": {
- "hashes": [
- "sha256:0090df83dbc5cb2405fa90844366d13176d1c0b48181c1807ab15f53be403f73",
- "sha256:89e8140a288acec20c5e56159461d3afa4073570c9758c05d4e6cb7f2f8cc440"
- ],
- "version": "==0.28.1"
- },
- "google-cloud-dataproc": {
- "hashes": [
- "sha256:1096534affe51b2e54b7c57f429b83c4566f92ed42fdee6613a001649c61c3d5",
- "sha256:8537b0de721ec4524cb580572f44f88d2b22a1f5c56968448003ca9023ecb951"
- ],
- "version": "==0.1.0"
- },
- "google-cloud-datastore": {
- "hashes": [
- "sha256:3094b38df19a0a6663b614122d97efcfeab35baf770070e960f6792522307997",
- "sha256:7cc7a48945dd0ec8aea2d8a169f8a3dfd84608392b00b514b4421b5a716b3263"
- ],
- "version": "==1.4.0"
- },
- "google-cloud-dns": {
- "hashes": [
- "sha256:2edcb33ea0025545746962839373ff6d314a5fac779fd7f7ddd0e592196fbb14",
- "sha256:5db79c21a2ee72854609db88d5a4ae897c32bf77bf6b51b3290e013b2db49e07"
- ],
- "version": "==0.28.0"
- },
- "google-cloud-error-reporting": {
- "hashes": [
- "sha256:6398c5963599f834d2ba2effbca4f67dbd21034252d33d0f367d1a7c868cfb98",
- "sha256:d4bdc20b5704afddc12a464622153c250be62a9d38a57d0f9371a0a3f3e43c71"
- ],
- "version": "==0.28.0"
- },
- "google-cloud-firestore": {
- "hashes": [
- "sha256:91b1373faf1c763ce9cda915a82655918035e4d09e8d2399170122f94274c977",
- "sha256:c12fe5a017a56c8d617bdf83ace3970afba886c09fc0d6c0d24f36f8644dc178"
- ],
- "version": "==0.28.0"
- },
- "google-cloud-language": {
- "hashes": [
- "sha256:59e7c3ed0dcf54d17943cad29ececb41adbbfc1b56c9ba278c13fd17aea54188",
- "sha256:b5c3073697b9c51b8f1a4a619f7b3b6744e88efe0e235958ff411ee17e825c61"
- ],
- "version": "==1.0.1"
- },
- "google-cloud-logging": {
- "hashes": [
- "sha256:81ff672f3b4c6f2a182282aae68f7c3dafbbfc8dc7dcc0db8dc90cd288adc3b8",
- "sha256:a1567aa858fbe223973c698f12198e5828fa66f3d42dac27e81b246ff385cd70"
- ],
- "version": "==1.4.0"
- },
- "google-cloud-monitoring": {
- "hashes": [
- "sha256:534d66d97611c9c6e08823532f5144f6786d3a6103a6d5ed6411ac465faa5341",
- "sha256:c7a336a469cf186e3621fe36bf0e4f291ea43a76b4a532b744a167d756a8170d"
- ],
- "version": "==0.28.1"
- },
- "google-cloud-pubsub": {
- "hashes": [
- "sha256:059dcd1862e07a47920474b0fe38d40508d8c4881929f17696a3aba834b11027",
- "sha256:ff6e74390dfa097e3cee7d5f323aeed37a23dfb1faa762d9efced09f972415a4"
- ],
- "version": "==0.30.1"
- },
- "google-cloud-resource-manager": {
- "hashes": [
- "sha256:ed4eaab40164614db77496986fb2900d93cb1daa7e1a23fd20be7bd46705a8f3",
- "sha256:fc29c11dcbe9208261d377185a1ae5331bab43f2a592222a25c8aca9c8031308"
- ],
- "version": "==0.28.1"
- },
- "google-cloud-runtimeconfig": {
- "hashes": [
- "sha256:095a1dab0d7b8e02dff06bee7844bb283a1c2c6f2e7333eb9896419a30143d82",
- "sha256:f441fbc22e2d0871ecb390854aa352cf467d2751cbc0dac7578274ead813519e"
- ],
- "version": "==0.28.1"
- },
- "google-cloud-spanner": {
- "hashes": [
- "sha256:39707e37e2399df0a6ea7f5b636fc06858359a70af92ac485bbba9bf810a43e7",
- "sha256:bc4c6269b07d0982289d4ecd9b88da943e097ab2104c79ce18e4966766bb877b"
- ],
- "version": "==0.29.0"
- },
- "google-cloud-speech": {
- "hashes": [
- "sha256:76e42ffdea4bb9790cac34d5a4cdb2e2d47a0e254a27dbbaf40fbd10fdb04e31",
- "sha256:ea920710b2fa74175bacc1f90a2eb326038eeaf3ce364c29a40e7eb70c7c7132"
- ],
- "version": "==0.30.0"
- },
- "google-cloud-storage": {
- "hashes": [
- "sha256:5db85fa905f85377c5ea9ea0fb4c5602343d2ecc09395aa2efbf9a21d27233fc",
- "sha256:939266b7d5c6df6d45a1aee2c47a09313f813e87790335f00c57708b49480054"
- ],
- "version": "==1.6.0"
- },
- "google-cloud-trace": {
- "hashes": [
- "sha256:b45bc7934ee459e83f1a4b04ba7a0de1be9fe004bcf494b9ff2f75e18414642c",
- "sha256:c926f55c141caf9fd0e29991caaba03a6364fe997384e2f48c34051f0f690186"
- ],
- "version": "==0.17.0"
- },
- "google-cloud-translate": {
- "hashes": [
- "sha256:4420f5b320145bf097ca9a12b18ec27c067886e2832d181f268c46c3bcb0d2e4",
- "sha256:7706814d8ee1be9668b5c204852d44e7f6c51d44a2822df772b7b2fa3fb7c6bb"
- ],
- "version": "==1.3.1"
- },
- "google-cloud-videointelligence": {
- "hashes": [
- "sha256:5c18e64bf41fc5f8203d3b4f7a88921ff8d6a09b8ce343da866274fa375dcb82",
- "sha256:d46b94f8d37da565a90fb282a4053ee1379b4cbe1e2b51dfef75444fd5422e5d"
- ],
- "version": "==1.0.1"
- },
- "google-cloud-vision": {
- "hashes": [
- "sha256:9442d3456935654920f002efa846271dc68b19f5a0b3e2c314a9edd9f34f30a3",
- "sha256:a4cd3f64ac2c2586a56ea00d27c58306465e3277873cc9bfce3677e9ccb2b039"
- ],
- "version": "==0.29.0"
- },
- "google-gax": {
- "hashes": [
- "sha256:518e8d5eb90774af2041080d242f4bcec4c6e653226c693901eaf82eda8a395c",
- "sha256:bc60cdfed5c657ea8542ba8102fb43862a1c4809d98ca5aa1fcc92f9d4f7403d"
- ],
- "version": "==0.15.16"
- },
- "google-resumable-media": {
- "hashes": [
- "sha256:116de90b9cd483b17c53618ee6a5a20f33e741c648140c8cc9c2141e07616ff1",
- "sha256:97de518f8166d442cc0b61fab308bcd319dbb970981e667ec8ded44f5ce49836"
- ],
- "version": "==0.3.1"
- },
- "googleapis-common-protos": {
- "hashes": [
- "sha256:c075eddaa2628ab519e01b7d75b76e66c40eaa50fc52758d8225f84708950ef2"
- ],
- "version": "==1.5.3"
- },
- "grpc-google-iam-v1": {
- "hashes": [
- "sha256:5009e831dcec22f3ff00e89405249d6a838d1449a46ac8224907aa5b0e0b1aec"
- ],
- "version": "==0.11.4"
- },
- "grpcio": {
- "hashes": [
- "sha256:03522ca2b80cc83c407d6d5fa0f532fc5b0160f5861ab6628ae418218022f339",
- "sha256:0c07fc5a9fa73832bca228d9004dc0a511d1f1a890c8ff0fb6e78f7fa1b96d58",
- "sha256:5487948a33ded0577035fb9ad5c542463fabdb356d27fb6cf3eed0bed78f5228",
- "sha256:592b14bab2cf8a5ac6e0b1e763b10828dda55ae74e4a476102897a25d033d0b8",
- "sha256:6727633444344e14d499f81e41e87de29f42653a0c048c0fce780b29e2ffe563",
- "sha256:7aaa523dd5e91acb61d41232eef96192a66b1e8d85d79a55a204093dce50b696",
- "sha256:7e81d2cd63fbc77782b140d728a5444c09a7ef1ea99957c490b722e416671921",
- "sha256:89b96cb00db3e4c2f612ddff81b27afd2406bf697f0654dcd5ba7bed3d5b9322",
- "sha256:a26d32bdc1479c3983194ad380e4cc4bad379729fac4d4acbbda209658d65610",
- "sha256:b092361aed6a5ab201e750ccc2fc59250448107cb958c63194455fd5161f3dcc",
- "sha256:bef5ce5cfe622830da8b92420748e7f5719d208198d750eb497a74fa20da5907",
- "sha256:c0232401d096d9beb06d1af42cf3d65bcc790557712c8b0226809c27401dd455",
- "sha256:cf6e88dc0fc06e446c689304bae6cc15df0517f02ee2ac52bcef0cb893b3e9ba",
- "sha256:cfc2595325ac3bbd170293050a3ff0b5c8e3846d8498d9b2dedcd3063460c007",
- "sha256:d4d4eb072fa2c44a8fe22475874c13c2cc786dba587f633cd47720fb8fec922e",
- "sha256:d78d29295723493327c68aa39407ec44ed7a4ed326dde413cce57e2409683f68",
- "sha256:d829735b1862687074ef9df15f6c4fc1f3b27c0c80d4890ef985c039cdb256c6",
- "sha256:db2430e38a268e64c980e34a62b7f50365a4401172a2f5202ad05eb156e418da",
- "sha256:df3f75f88431da28b1c60a043f2a4f2e359287249baf317f4eda20626b34ce5d",
- "sha256:e1a648519db0de7bab3583dc2471f4c12ee3fc8a8c459e75da9e05ae70cf9ce7",
- "sha256:e35e4c0e5fe5f0fd8914a1630443f7272a752fb9d39c96771799a32056b06ab5",
- "sha256:e9b09fbae7bcc1b1791168665c29bcf37253f54d9d5e6bc8a533a9917912f1f6",
- "sha256:e9b9e67059c81f5b232991c573601856065562f010b655b3afd7ab38459ac321",
- "sha256:e9cf01192ed663a277b64bc559066b07a1355cd85b74cbca9353a25e8bd45544",
- "sha256:ef886f8c845ce7c011b2c154fd2b7998f920d145a2dfa121459e8df5b562a879",
- "sha256:efcf5ea73af68e110c375173495f36d9ed1e50bc88eeae1314f5573f750eecb9",
- "sha256:fc4deee5b4402c4bd37b3c7dbcde8903be89b9026af0f5777f63ae24069cafec"
- ],
- "version": "==1.11.0"
- },
- "happybase": {
- "hashes": [
- "sha256:e20376e2e32291798d2226502994134c1c4e175136d8375b3c517a234fa22481"
- ],
- "index": "ia",
- "version": "==1.1.0"
- },
- "httplib2": {
- "hashes": [
- "sha256:e71daed9a0e6373642db61166fa70beecc9bf04383477f84671348c02a04cbdf"
- ],
- "version": "==0.11.3"
- },
- "ialib": {
- "hashes": [
- "sha256:6ec318ec7ce6aeb326984bb259dcbe0c0907e648b41ad6c0bba1a7c6eb285ef2"
- ],
- "version": "==0.1.1"
- },
- "idna": {
- "hashes": [
- "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
- "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
- ],
- "version": "==2.6"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:cbb3fcf8d3e33df861709ecaf89d9e6629cff0a217bc2848f1b41cd30d360519"
- ],
- "version": "==0.24"
- },
- "jinja2": {
- "hashes": [
- "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd",
- "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"
- ],
- "version": "==2.10"
- },
- "jmespath": {
- "hashes": [
- "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64",
- "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63"
- ],
- "version": "==0.9.3"
- },
- "markupsafe": {
- "hashes": [
- "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665"
- ],
- "version": "==1.0"
- },
- "mrjob": {
- "hashes": [
- "sha256:669a98afe53f4e4ad9373445f91a8c26e86e30f1acc50c7fd274a42e1ea846d8",
- "sha256:6ec1aca9b376c98249a324dd7837cc76e676d48ba10b2b6cbef246eba0ee52e9"
- ],
- "index": "ia",
- "version": "==0.6.2"
- },
- "oauth2client": {
- "hashes": [
- "sha256:5b5b056ec6f2304e7920b632885bd157fa71d1a7f3ddd00a43b1541a8d1a2460"
- ],
- "version": "==3.0.0"
- },
- "pillow": {
- "hashes": [
- "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8",
- "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
- "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945",
- "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee",
- "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465",
- "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
- "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2",
- "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964",
- "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061",
- "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
- "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
- "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
- "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
- "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c",
- "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
- "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8",
- "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
- "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e",
- "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f",
- "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51",
- "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3",
- "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2",
- "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
- "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
- "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b",
- "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0",
- "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
- "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
- "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
- "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2",
- "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
- "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f",
- "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
- "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b",
- "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c",
- "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d",
- "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
- "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f",
- "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
- "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
- "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110",
- "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
- "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
- "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02",
- "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
- "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
- ],
- "version": "==3.1.1"
- },
- "ply": {
- "hashes": [
- "sha256:e7d1bdff026beb159c9942f7a17e102c375638d9478a7ecd4cc0c76afd8de0b8"
- ],
- "version": "==3.8"
- },
- "proto-google-cloud-datastore-v1": {
- "hashes": [
- "sha256:a431bb6a286107900c9ce3c48d316378867293c50a4d8a6c7393264600e916f9"
- ],
- "version": "==0.90.4"
- },
- "proto-google-cloud-error-reporting-v1beta1": {
- "hashes": [
- "sha256:526f49ae84dec16aa4740415c86a919d48206001d116d12df5badc5b2b051b82"
- ],
- "version": "==0.15.3"
- },
- "proto-google-cloud-logging-v2": {
- "hashes": [
- "sha256:e0538745f0f33760ffd5116f33385113835a3b2042a6c914923be0539e5280ce"
- ],
- "version": "==0.91.3"
- },
- "protobuf": {
- "hashes": [
- "sha256:01ccd6d03449ae75b779fb5bf4ed62177d61afe3c5e6465ccf3f8b2e1a84afbe",
- "sha256:1d92cc30b0b46cced33adde5853d920179eb5ea8eecdee9552502a7f29cc3f21",
- "sha256:242e4c7ae565267a8bc8b92d707177f915607ea4bd73244bec6cbf4a49b96661",
- "sha256:3b60685732bd0cbdc802dfcb6071efbcf5d927ce3127c13c33ea1a8efae3aa76",
- "sha256:3f655e1f99c3e14d56ca900af1b9a4715b691319a295cc38939d7f77eabd5e7c",
- "sha256:560a38e692a69957a70ba0e5839aa67430efd63072bf91b0539dac19055694cd",
- "sha256:5c1c8f6a0a68a874e3beff89255959dd80fad45870e96c88944a1b81a22dd5f5",
- "sha256:628a3bf0794a8b3cabb18db11eb67cc10e0cc6e5525d557ae7b682bb73fa2018",
- "sha256:7222d6616108b33ad6cbeff8117062a73c43cdc8fa8f64f6a322ebeb663e710e",
- "sha256:76ef6ca3c50e4cfd044861586d5f1b352e0fe7f17f883df6c165bad5b4d0e10a",
- "sha256:7c193e6964e752bd056735594826c5b03274ceb8f07349d3ae47d9766250ba96",
- "sha256:869e12bcfb5759e683f53ec1dd6155b7be034065431da289f0cb4510040a0799",
- "sha256:905414e5ea6cdb78d8730f66335755152b46685fcb9fc2f2134024e3ea9e8dcc",
- "sha256:ac0067e3c60737865ed72bb7416e02297d229d960902802d874c0e167128c809",
- "sha256:adf716a89c9cc1891ead79a861c427071ef59172f0e11967b00565a9547b3bd0",
- "sha256:bcfa99f5a82f5eaaf6e5cee5bfdca5a1670f5740aec1d93dae170645ed1a16b0",
- "sha256:cc94079ae6cbcea5ae194464a30f3223f075e06a0446f52bca9ddbeb6e9f412a",
- "sha256:d5d9edfdc5a3a01d06062d677b121081629782edf0e05ca1be14f15bb947eeee",
- "sha256:e269ab7a50bf0fa6fe6a88ea7dcc7a1079ae9450d9ab9b7730ac32916d55508b",
- "sha256:e7fd33a3474cbe18fd5b5620784a0fa21fcae3e402b1806e29c6b450c7f61706"
- ],
- "version": "==3.5.2.post1"
- },
- "psutil": {
- "hashes": [
- "sha256:325c334596ad2d8a178d0e7b4eecc91748096a87489b3701ee16986173000aaa",
- "sha256:33384065f0014351fa70187548e3e95952c4df4bc5c38648bd0e647d21eaaf01",
- "sha256:51e12aa74509832443862373a2655052b20c83cad7322f49d217452500b9a405",
- "sha256:52a91ba928a5e86e0249b4932d6e36972a72d1ad8dcc5b7f753a2ae14825a4ba",
- "sha256:99029b6af386b22882f0b6d537ffed5a9c3d5ff31782974aeaa1d683262d8543",
- "sha256:b10703a109cc9225cd588c207f7f93480a420ade35c13515ea8f20063b42a392",
- "sha256:ddba952ed256151844d82fb13c8fb1019fe11ecaeacbd659d67ba5661ae73d0d",
- "sha256:ebe293be36bb24b95cdefc5131635496e88b17fabbcf1e4bc9b5c01f5e489cfe",
- "sha256:f24cd52bafa06917935fe1b68c5a45593abe1f3097dc35b2dfc4718236795890"
- ],
- "version": "==5.4.5"
- },
- "publicsuffix": {
- "hashes": [
- "sha256:99a3a06d6eb19c57057d17560908b757995396ad76e6513c9d17e6a7a1266c91",
- "sha256:ae77593d269e1e5131723259cc1142c25690c20c59f2e98f67e227228028bda9"
- ],
- "version": "==1.1.0"
- },
- "pyasn1": {
- "hashes": [
- "sha256:0d7f6e959fe53f3960a23d73f35e1fce61348b30915b6664309ca756de7c1f89",
- "sha256:5a0db897b311d265cde49615cf783f1c78613138605cdd0f907ecfa5b2aba3ee",
- "sha256:758cb50abddc03e4563fd9e7f03db56e3e87b58c0bd01247360326e5c0c7ffa5",
- "sha256:7d626683e3d792cccc608da02498aff37ab4f3dafd8905d6bf755d11f9b26b43",
- "sha256:a7efe807c4b83a859e2735c692b92ed7b567cfddc4163763412920041d876c2b",
- "sha256:b5a9ca48055b9a20f6d1b3d68e38692e5431c86a0f99ea602e61294e891fee5b",
- "sha256:c07d6e587b2f928366b1f67c09bda026a3e6fcc99e80a744dc67f8fca3895626",
- "sha256:d258b0a71994f7770599835249cece1caef3c70def868c4915e6e5ca49b67d15",
- "sha256:d5cd6ed995dba16fad0c521cfe31cd2d68400b53fcc2bce93326829be73ab6d1",
- "sha256:d84c2aea3cf43780e9e6a19f4e4dddee9f6976519020e64e47c57e5c7a8c3dd2",
- "sha256:e85895087905c65b5b594eb91f7522664c85545b147d5f4d4e7b1b07da8dcbdc",
- "sha256:f81c96761fca60d64b1c9b79ec2e40cf9495a745cf570613079ef324aeb9672b"
- ],
- "version": "==0.4.2"
- },
- "pyasn1-modules": {
- "hashes": [
- "sha256:041e9fbafac548d095f5b6c3b328b80792f006196e15a232b731a83c93d59493",
- "sha256:0cdca76a68dcb701fff58c397de0ef9922b472b1cb3ea9695ca19d03f1869787",
- "sha256:0cea139045c38f84abaa803bcb4b5e8775ea12a42af10019d942f227acc426c3",
- "sha256:0f2e50d20bc670be170966638fa0ae603f0bc9ed6ebe8e97a6d1d4cef30cc889",
- "sha256:47fb6757ab78fe966e7c58b2030b546854f78416d653163f0ce9290cf2278e8b",
- "sha256:598a6004ec26a8ab40a39ea955068cf2a3949ad9c0030da970f2e1ca4c9f1cc9",
- "sha256:72fd8b0c11191da088147c6e4678ec53e573923ecf60b57eeac9e97433e09fc2",
- "sha256:854700bbdd01394e2ada9c1bfbd0ed9f5d0c551350dbbd023e88b11d2771ae06",
- "sha256:af00ea8f2022b6287dc375b2c70f31ab5af83989fc6fe9eacd4976ce26cd7ccc",
- "sha256:b1f395cae2d669e0830cb023aa86f9f283b7a9aa32317d7f80d8e78aa2745812",
- "sha256:c6747146e95d2b14cc2a8399b2b0bde3f93778f8f9ec704690d2b589c376c137",
- "sha256:f53fe5bcebdf318f51399b250fe8325ef3a26d927f012cc0c8e0f9e9af7f9deb"
- ],
- "version": "==0.2.1"
- },
- "pylru": {
- "hashes": [
- "sha256:71376192671f0ad1690b2a7427d39a29b1df994c8469a9b46b03ed7e28c0172c"
- ],
- "version": "==1.0.9"
- },
- "pymysql": {
- "hashes": [
- "sha256:04fa19fad017fdb21394fad2878c1d6bd346959d4fbfd1b66050a09fc636a321",
- "sha256:32da4a66397077d42908e449688f2ec71c2b18892a6cd04f03ab2aa828a70f40"
- ],
- "version": "==0.8.0"
- },
- "python-dateutil": {
- "hashes": [
- "sha256:891c38b2a02f5bb1be3e4793866c8df49c7d19baabf9c1bad62547e0b4866aca",
- "sha256:95511bae634d69bc7329ba55e646499a842bc4ec342ad54a8cdb65645a0aad3c"
- ],
- "version": "==2.6.1"
- },
- "pytz": {
- "hashes": [
- "sha256:65ae0c8101309c45772196b21b74c46b2e5d11b6275c45d251b150d5da334555",
- "sha256:c06425302f2cf668f1bba7a0a03f3c1d34d4ebeef2c72003da308b3947c7f749"
- ],
- "version": "==2018.4"
- },
- "pyyaml": {
- "hashes": [
- "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8",
- "sha256:0d8116e53b63bd0c50335d5cf151b0001bd8152e540365c515d77ed067964ffd",
- "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736",
- "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f",
- "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608",
- "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8",
- "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab",
- "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7",
- "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3",
- "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1",
- "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6",
- "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8",
- "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4",
- "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca",
- "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269"
- ],
- "version": "==3.12"
- },
- "raven": {
- "hashes": [
- "sha256:738a52019d01955d5b44b49d67c9f2f4cedb1b4f70d4fb0b493931174d00e044",
- "sha256:92bf4c4819472ed20f1b9905eeeafe1bc6fe5f273d7c14506fdb8fb3a6ab2074"
- ],
- "index": "ia",
- "version": "==6.6.0"
- },
- "redis": {
- "hashes": [
- "sha256:8a1900a9f2a0a44ecf6e8b5eb3e967a9909dfed219ad66df094f27f7d6f330fb",
- "sha256:a22ca993cea2962dbb588f9f30d0015ac4afcc45bee27d3978c0dbe9e97c6c0f"
- ],
- "version": "==2.10.6"
- },
- "requests": {
- "hashes": [
- "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
- "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
- ],
- "index": "ia",
- "version": "==2.18.4"
- },
- "requests-file": {
- "hashes": [
- "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea",
- "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa"
- ],
- "version": "==1.4.3"
- },
- "robotexclusionrulesparser": {
- "hashes": [
- "sha256:d23aa14ae8145c13c95612d696736bad52a4bd0819ce8c9437ee745098fb8388"
- ],
- "version": "==1.7.1"
- },
- "rsa": {
- "hashes": [
- "sha256:25df4e10c263fb88b5ace923dd84bf9aa7f5019687b5e55382ffcdb8bede9db5",
- "sha256:43f682fea81c452c98d09fc316aae12de6d30c4b5c84226642cf8f8fd1c93abd"
- ],
- "version": "==3.4.2"
- },
- "s3transfer": {
- "hashes": [
- "sha256:90dc18e028989c609146e241ea153250be451e05ecc0c2832565231dacdf59c1",
- "sha256:c7a9ec356982d5e9ab2d4b46391a7d6a950e2b04c472419f5fdec70cc0ada72f"
- ],
- "version": "==0.1.13"
- },
- "schedule": {
- "hashes": [
- "sha256:1003a07c2dce12828c25a03a611a7371cedfa956e5f1b4abc32bcc94eb5a335b",
- "sha256:a24e75fc5e5acbd204049d55329e39a2a9a3479bca2e34c7fde81386c9d8d2fa"
- ],
- "version": "==0.5.0"
- },
- "six": {
- "hashes": [
- "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
- "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
- ],
- "version": "==1.11.0"
- },
- "sqlalchemy": {
- "hashes": [
- "sha256:7cb00cc9b9f92ef8b4391c8a2051f81eeafefe32d63c6b395fd51401e9a39edb"
- ],
- "version": "==1.2.6"
- },
- "surt": {
- "hashes": [
- "sha256:1e6a2f3e626b45d6c2aa278f7d5b176e8a06586870453045e6f5f2cfe5cc62a3"
- ],
- "version": "==0.3.0"
- },
- "thriftpy": {
- "hashes": [
- "sha256:309e57d97b5bfa01601393ad4f245451e989d6206a59279e56866b264a99796d",
- "sha256:6baceabd40f0934186ebcfd1f559d34a9f165b65ac5d396a39ef7f61e44d9156"
- ],
- "version": "==0.3.9"
- },
- "tldextract": {
- "hashes": [
- "sha256:29797125db1f2e72ce2ee51f7a764ec8b1e6588812520795ffeae93bcd46bab4",
- "sha256:84a0b275c262e34df7506e10767e357e8b5a755a3a620cdc2cfe035061f7806d"
- ],
- "version": "==2.2.0"
- },
- "twitter": {
- "hashes": [
- "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
- "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
- ],
- "version": "==1.18.0"
- },
- "urllib3": {
- "hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
- ],
- "version": "==1.22"
- },
- "warctools": {
- "hashes": [
- "sha256:ce0c6e274db8ac8810f7c97b3943e8e8deadbc3f5c982db77cddaae2d2ae6170"
- ],
- "version": "==4.10.0"
- },
- "wayback": {
- "hashes": [
- "sha256:4ba5d6a48de9b73171df6c2c0378954fd08a2b55b7fecb9224dd48a87281efbe"
- ],
- "index": "ia",
- "version": "==0.2.7.4"
- },
- "wayback-esp": {
- "hashes": [
- "sha256:e24a692f805adc87124cedccafaa37b450ab47b5bda409beae57bcf57ef77f89"
- ],
- "version": "==0.1.2.1"
- },
- "wayback-search-js": {
- "hashes": [
- "sha256:afe198a56ca86eb0e6e78e54ad1efaa16976163f153e6e45f6c44c1d280889db"
- ],
- "version": "==1.4.11"
- },
- "wbex-client": {
- "hashes": [
- "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
- ],
- "version": "==0.1.5"
- },
- "werkzeug": {
- "hashes": [
- "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c",
- "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b"
- ],
- "version": "==0.14.1"
- },
- "xmltodict": {
- "hashes": [
- "sha256:8f8d7d40aa28d83f4109a7e8aa86e67a4df202d9538be40c0cb1d70da527b0df",
- "sha256:add07d92089ff611badec526912747cf87afd4f9447af6661aca074eeaf32615"
- ],
- "index": "ia",
- "version": "==0.11.0"
- }
- },
- "develop": {
- "astroid": {
- "hashes": [
- "sha256:35cfae47aac19c7b407b7095410e895e836f2285ccf1220336afba744cc4c5f2",
- "sha256:38186e481b65877fd8b1f9acc33e922109e983eb7b6e487bd4c71002134ad331"
- ],
- "version": "==1.6.3"
- },
- "attrs": {
- "hashes": [
- "sha256:1c7960ccfd6a005cd9f7ba884e6316b5e430a3f1a6c37c5f87d8b43f83b54ec9",
- "sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450"
- ],
- "version": "==17.4.0"
- },
- "backcall": {
- "hashes": [
- "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
- "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
- ],
- "version": "==0.1.0"
- },
- "certifi": {
- "hashes": [
- "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7",
- "sha256:9fa520c1bacfb634fa7af20a76bcbd3d5fb390481724c597da32c719a7dca4b0"
- ],
- "version": "==2018.4.16"
- },
- "chardet": {
- "hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
- ],
- "version": "==3.0.4"
- },
- "cookies": {
- "hashes": [
- "sha256:15bee753002dff684987b8df8c235288eb8d45f8191ae056254812dfd42c81d3",
- "sha256:d6b698788cae4cfa4e62ef8643a9ca332b79bd96cb314294b864ae8d7eb3ee8e"
- ],
- "version": "==2.2.1"
- },
- "coverage": {
- "hashes": [
- "sha256:03481e81d558d30d230bc12999e3edffe392d244349a90f4ef9b88425fac74ba",
- "sha256:0b136648de27201056c1869a6c0d4e23f464750fd9a9ba9750b8336a244429ed",
- "sha256:104ab3934abaf5be871a583541e8829d6c19ce7bde2923b2751e0d3ca44db60a",
- "sha256:15b111b6a0f46ee1a485414a52a7ad1d703bdf984e9ed3c288a4414d3871dcbd",
- "sha256:198626739a79b09fa0a2f06e083ffd12eb55449b5f8bfdbeed1df4910b2ca640",
- "sha256:1c383d2ef13ade2acc636556fd544dba6e14fa30755f26812f54300e401f98f2",
- "sha256:28b2191e7283f4f3568962e373b47ef7f0392993bb6660d079c62bd50fe9d162",
- "sha256:2eb564bbf7816a9d68dd3369a510be3327f1c618d2357fa6b1216994c2e3d508",
- "sha256:337ded681dd2ef9ca04ef5d93cfc87e52e09db2594c296b4a0a3662cb1b41249",
- "sha256:3a2184c6d797a125dca8367878d3b9a178b6fdd05fdc2d35d758c3006a1cd694",
- "sha256:3c79a6f7b95751cdebcd9037e4d06f8d5a9b60e4ed0cd231342aa8ad7124882a",
- "sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287",
- "sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1",
- "sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000",
- "sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1",
- "sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e",
- "sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5",
- "sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062",
- "sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba",
- "sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc",
- "sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc",
- "sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99",
- "sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653",
- "sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c",
- "sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558",
- "sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f",
- "sha256:9e112fcbe0148a6fa4f0a02e8d58e94470fc6cb82a5481618fea901699bf34c4",
- "sha256:ac4fef68da01116a5c117eba4dd46f2e06847a497de5ed1d64bb99a5fda1ef91",
- "sha256:b8815995e050764c8610dbc82641807d196927c3dbed207f0a079833ffcf588d",
- "sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9",
- "sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd",
- "sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d",
- "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
- "sha256:e4d96c07229f58cb686120f168276e434660e4358cc9cf3b0464210b04913e77",
- "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80",
- "sha256:f8a923a85cb099422ad5a2e345fe877bbc89a8a8b23235824a93488150e45f6e"
- ],
- "version": "==4.5.1"
- },
- "decorator": {
- "hashes": [
- "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
- "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
- ],
- "version": "==4.3.0"
- },
- "happybase-mock": {
- "hashes": [
- "sha256:327203ff63171a83c9fab34b249636b6a55550041273d2acddc0723433bdf260",
- "sha256:bd4583551f40e8b7f622ffd462f8e7ed1d34d14d73fa1758f0a5f413b1949f50"
- ],
- "index": "ia",
- "version": "==0.9.0"
- },
- "idna": {
- "hashes": [
- "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
- "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
- ],
- "version": "==2.6"
- },
- "ipython": {
- "hashes": [
- "sha256:85882f97d75122ff8cdfe129215a408085a26039527110c8d4a2b8a5e45b7639",
- "sha256:a6ac981381b3f5f604b37a293369963485200e3639fb0404fa76092383c10c41"
- ],
- "index": "ia",
- "version": "==6.3.1"
- },
- "ipython-genutils": {
- "hashes": [
- "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
- "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
- ],
- "version": "==0.2.0"
- },
- "isort": {
- "hashes": [
- "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
- "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8",
- "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"
- ],
- "version": "==4.3.4"
- },
- "jedi": {
- "hashes": [
- "sha256:1972f694c6bc66a2fac8718299e2ab73011d653a6d8059790c3476d2353b99ad",
- "sha256:5861f6dc0c16e024cbb0044999f9cf8013b292c05f287df06d3d991a87a4eb89"
- ],
- "version": "==0.12.0"
- },
- "lazy-object-proxy": {
- "hashes": [
- "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
- "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
- "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
- "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
- "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
- "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
- "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
- "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
- "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
- "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
- "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
- "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
- "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
- "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
- "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
- "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
- "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
- "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
- "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
- "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
- "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
- "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
- "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
- "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
- "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
- "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
- "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
- "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a",
- "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b"
- ],
- "version": "==1.3.1"
- },
- "mccabe": {
- "hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
- ],
- "version": "==0.6.1"
- },
- "more-itertools": {
- "hashes": [
- "sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea",
- "sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e",
- "sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44"
- ],
- "version": "==4.1.0"
- },
- "parso": {
- "hashes": [
- "sha256:62bd6bf7f04ab5c817704ff513ef175328676471bdef3629d4bdd46626f75551",
- "sha256:a75a304d7090d2c67bd298091c14ef9d3d560e3c53de1c239617889f61d1d307"
- ],
- "version": "==0.2.0"
- },
- "pexpect": {
- "hashes": [
- "sha256:9783f4644a3ef8528a6f20374eeb434431a650c797ca6d8df0d81e30fffdfa24",
- "sha256:9f8eb3277716a01faafaba553d629d3d60a1a624c7cf45daa600d2148c30020c"
- ],
- "markers": "sys_platform != 'win32'",
- "version": "==4.5.0"
- },
- "pickleshare": {
- "hashes": [
- "sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
- "sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
- ],
- "version": "==0.7.4"
- },
- "pluggy": {
- "hashes": [
- "sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff",
- "sha256:d345c8fe681115900d6da8d048ba67c25df42973bda370783cd58826442dcd7c",
- "sha256:e160a7fcf25762bb60efc7e171d4497ff1d8d2d75a3d0df7a21b76821ecbf5c5"
- ],
- "version": "==0.6.0"
- },
- "prompt-toolkit": {
- "hashes": [
- "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
- "sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
- "sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
- ],
- "version": "==1.0.15"
- },
- "ptyprocess": {
- "hashes": [
- "sha256:e64193f0047ad603b71f202332ab5527c5e52aa7c8b609704fc28c0dc20c4365",
- "sha256:e8c43b5eee76b2083a9badde89fd1bbce6c8942d1045146e100b7b5e014f4f1a"
- ],
- "version": "==0.5.2"
- },
- "py": {
- "hashes": [
- "sha256:29c9fab495d7528e80ba1e343b958684f4ace687327e6f789a94bf3d1915f881",
- "sha256:983f77f3331356039fdd792e9220b7b8ee1aa6bd2b25f567a963ff1de5a64f6a"
- ],
- "version": "==1.5.3"
- },
- "pygments": {
- "hashes": [
- "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
- "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
- ],
- "version": "==2.2.0"
- },
- "pylint": {
- "hashes": [
- "sha256:0b7e6b5d9f1d4e0b554b5d948f14ed7969e8cdf9a0120853e6e5af60813b18ab",
- "sha256:34738a82ab33cbd3bb6cd4cef823dbcabdd2b6b48a4e3a3054a2bbbf0c712be9"
- ],
- "index": "ia",
- "version": "==1.8.4"
- },
- "pytest": {
- "hashes": [
- "sha256:6266f87ab64692112e5477eba395cfedda53b1933ccd29478e671e73b420c19c",
- "sha256:fae491d1874f199537fd5872b5e1f0e74a009b979df9d53d1553fd03da1703e1"
- ],
- "index": "ia",
- "version": "==3.5.0"
- },
- "pytest-cov": {
- "hashes": [
- "sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
- "sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
- ],
- "index": "ia",
- "version": "==2.5.1"
- },
- "pytest-pythonpath": {
- "hashes": [
- "sha256:f3d46b0a8276e856f7dc4f70ca97b88be6fbcf52d57ce36e35057d502388265e"
- ],
- "index": "ia",
- "version": "==0.7.2"
- },
- "requests": {
- "hashes": [
- "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
- "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
- ],
- "index": "ia",
- "version": "==2.18.4"
- },
- "responses": {
- "hashes": [
- "sha256:c6082710f4abfb60793899ca5f21e7ceb25aabf321560cc0726f8b59006811c9",
- "sha256:f23a29dca18b815d9d64a516b4a0abb1fbdccff6141d988ad8100facb81cf7b3"
- ],
- "index": "ia",
- "version": "==0.9.0"
- },
- "simplegeneric": {
- "hashes": [
- "sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
- ],
- "version": "==0.8.1"
- },
- "six": {
- "hashes": [
- "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
- "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
- ],
- "version": "==1.11.0"
- },
- "traitlets": {
- "hashes": [
- "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
- "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
- ],
- "version": "==4.3.2"
- },
- "urllib3": {
- "hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
- ],
- "version": "==1.22"
- },
- "wcwidth": {
- "hashes": [
- "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
- "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
- ],
- "version": "==0.1.7"
- },
- "wrapt": {
- "hashes": [
- "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
- ],
- "version": "==1.10.11"
- }
- }
-}
diff --git a/mapreduce/README.md b/mapreduce/README.md
deleted file mode 100644
index aebc160..0000000
--- a/mapreduce/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-
-Hadoop streaming map/reduce jobs written in python using the mrjob library.
-
-## Development and Testing
-
-System dependencies in addition to `../README.md`:
-
-- `libjpeg-dev` (for wayback libraries)
-
-Run the tests with:
-
- pipenv run pytest
-
-Check test coverage with:
-
- pytest --cov --cov-report html
- # open ./htmlcov/index.html in a browser
-
-TODO: Persistant GROBID and HBase during development? Or just use live
-resources?
-
-## Extraction Task
-
-An example actually connecting to HBase from a local machine, with thrift
-running on a devbox and GROBID running on a dedicated machine:
-
- ./extraction_cdx_grobid.py \
- --hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
- --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
- tests/files/example.cdx
-
-Running from the cluster:
-
- # Create tarball of virtualenv
- export PIPENV_VENV_IN_PROJECT=1
- pipenv shell
- export VENVSHORT=`basename $VIRTUAL_ENV`
- tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT .
-
- ./extraction_cdx_grobid.py \
- --hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
- --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
- -r hadoop \
- -c mrjob.conf \
- --archive $VENVSHORT.tar.gz#venv \
- hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
-
-## Backfill Task
-
-An example actually connecting to HBase from a local machine, with thrift
-running on a devbox:
-
- ./backfill_hbase_from_cdx.py \
- --hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
- tests/files/example.cdx
-
-Actual invocation to run on Hadoop cluster (running on an IA devbox, where
-hadoop environment is configured):
-
- # Create tarball of virtualenv
- export PIPENV_VENV_IN_PROJECT=1
- pipenv install --deploy
- tar -czf venv-current.tar.gz -C .venv .
-
- ./backfill_hbase_from_cdx.py \
- --hbase-host wbgrp-svc263.us.archive.org \
- --hbase-table wbgrp-journal-extract-0-qa \
- -r hadoop \
- -c mrjob.conf \
- --archive $VENVSHORT.tar.gz#venv \
- hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
diff --git a/mapreduce/TODO b/mapreduce/TODO
deleted file mode 100644
index 4f4db16..0000000
--- a/mapreduce/TODO
+++ /dev/null
@@ -1,4 +0,0 @@
-- quality scoring (of JSON output)
-- use pre-mapper `grep` command to filter down, eg, by status?
-- automation/docs for bundling virtualenv along
-- think about speedups
diff --git a/mapreduce/pytest.ini b/mapreduce/pytest.ini
deleted file mode 100644
index 0a5e921..0000000
--- a/mapreduce/pytest.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-
-[pytest]
-
-# allow imports from files in current directory
-python_paths = .
-
-# search for 'test_*' functions in all python files, not just under tests
-python_files = *.py
diff --git a/mapreduce/xml2json.py b/mapreduce/xml2json.py
deleted file mode 100644
index df5064f..0000000
--- a/mapreduce/xml2json.py
+++ /dev/null
@@ -1,7 +0,0 @@
-
-import json
-import xmltodict
-
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
- thing = xmltodict.parse(f, process_namespaces=False)
- print(json.dumps(thing))
diff --git a/match_test_data/NOTES.txt b/match_test_data/NOTES.txt
new file mode 100644
index 0000000..c72f60a
--- /dev/null
+++ b/match_test_data/NOTES.txt
@@ -0,0 +1,13 @@
+
+Converted older .tsv from pdf-extraction comparison work with:
+
+ cat 1k_random_identified_combined.tsv | jq -c --slurp --raw-input --raw-output 'split("\n") | .[:-1] | map(split("\t")) | map({"doi": .[0], "title": .[1], "authors": ( .[2] | split(";") ), "year": .[3], "journal": .[4], "publisher": .[5], "subject": .[6], "type": .[7], "sha": .[8]}) | .[]' > crossref_sample.bibjson
+
+Note that neither bibjson file is a superset of the other:
+
+ 944 unique SHA1 which exist in both lists
+ 964 in crossref_sample.sha1
+ 979 in grobid_sample.sha1
+
+So scoring should be on a basis of "out of 944 lines". If this is confusing we
+can trim the files down.
diff --git a/match_test_data/RESULTS.txt b/match_test_data/RESULTS.txt
new file mode 100644
index 0000000..c776ebb
--- /dev/null
+++ b/match_test_data/RESULTS.txt
@@ -0,0 +1,30 @@
+
+"Out of 944 lines"...
+
+## Git 92584ec4201ecc27af423cbff7b4bc1573edf175
+
+76.27% match.
+
+ time ./please --qa match-benchmark match_test_data/crossref_sample.bibjson match_test_data/grobid_sample.bibjson out.test
+
+ real 0m56.061s
+ user 1m3.852s
+ sys 0m3.924s
+
+ 720 lines
+ 720 uniq DOI
+ 720 uniq SHA1
+
+## Git aa2f905d65713a581c7630ef2f931045059200ef
+
+ real 0m56.347s
+ user 1m3.328s
+ sys 0m4.000s
+
+ bnewbold@orithena$ wc -l out.test
+ 722 out.test
+ bnewbold@orithena$ cut -f3 out.test | jq .doi -r | sort -u | wc -l
+ 722
+ bnewbold@orithena$ cut -f4 out.test | jq .sha1 -r | sort -u | wc -l
+ 722
+
diff --git a/match_test_data/crossref_sample.bibjson b/match_test_data/crossref_sample.bibjson
new file mode 100644
index 0000000..bae26b8
--- /dev/null
+++ b/match_test_data/crossref_sample.bibjson
@@ -0,0 +1,964 @@
+{"doi":"10.1001/archsurg.2012.704","title":"Detection of Colon Cancer Metastases With Fluorescence Laparoscopy in Orthotopic Nude Mouse Models","authors":["Rhiana S. Menen","Sharmeela Kaushal","Cynthia S. Snyder","Mark A. Talamini","Robert M. Hoffman","Michael Bouvet"],"year":"2012","journal":"Archives of Surgery","publisher":"American Medical Association (AMA)","subject":"Medicine(all)","type":"journal-article","sha":"521db80c5f649d11d700b02774a2bdffde99d92c"}
+{"doi":"10.1001/jamadermatol.2015.3008","title":"Seasonal and Geographic Patterns in Tanning Using Real-Time Data From Google Trends","authors":["Bez Toosi","Sunil Kalia"],"year":"2016","journal":"JAMA Dermatology","publisher":"American Medical Association (AMA)","subject":"","type":"journal-article","sha":"b026f1a599b6c5b3d7c5958cb0377ea59e207545"}
+{"doi":"10.1001/jamainternmed.2015.4838","title":"Mediterranean Diet and Invasive Breast Cancer Risk Among Women at High Cardiovascular Risk in the PREDIMED Trial","authors":["Estefanía Toledo","Jordi Salas-Salvadó","Carolina Donat-Vargas","Pilar Buil-Cosiales","Ramón Estruch","Emilio Ros","Dolores Corella","Montserrat Fitó","Frank B. Hu","Fernando Arós","Enrique Gómez-Gracia","Dora Romaguera","Manuel Ortega-Calvo","Lluís Serra-Majem","Xavier Pintó","Helmut Schröder","Josep Basora","José Vicente Sorlí","Mònica Bulló","Merce Serra-Mir","Miguel A. Martínez-González"],"year":"2015","journal":"JAMA Internal Medicine","publisher":"American Medical Association (AMA)","subject":"","type":"journal-article","sha":"bac12831a0eab2dac05efc912083d6819aae2ccc"}
+{"doi":"10.1002/(sici)1098-2272(200004)18:4<293::aid-gepi3>3.3.co;2-c","title":"Score tests for familial correlation in genotypedâ€proband designs","authors":["Raymond J. Carroll","Mitchell H. Gail","Jacques Benichou","David Pee"],"year":"2000","journal":"Genetic Epidemiology","publisher":"Wiley-Blackwell","subject":"Genetics(clinical)","type":"journal-article","sha":"e11b4fb7ecfa071fc5ca8bc22f34c926d3724ad9"}
+{"doi":"10.1002/(sici)1099-047x(199909)9:5<376::aid-mmce2>3.0.co;2-m","title":"Implementation of the local reference node concept for spatially distributed circuits","authors":["Carlos E. Christoffersen","Michael B. Steer"],"year":"1999","journal":"International Journal of RF and Microwave Computer-Aided Engineering","publisher":"Wiley-Blackwell","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"ab5217a9b9813972098b3928b5e3ed49d2118117"}
+{"doi":"10.1002/(sici)1099-1115(199908)13:6<469::aid-acs555>3.0.co;2-o","title":"Quo vadis, Bayesian identification?","authors":["Rudolf Kulhavý","Petya Ivanova"],"year":"1999","journal":"International Journal of Adaptive Control and Signal Processing","publisher":"Wiley-Blackwell","subject":"Control and Systems Engineering","type":"journal-article","sha":"9f811b1265bc77fe113db3d11f8c8b4611c0c68b"}
+{"doi":"10.1002/1097-0142(19801215)46:12+<2759::aid-cncr2820461402>3.0.co;2-7","title":"Historical perspective","authors":["Elwood V. Jensen"],"year":"1980","journal":"Cancer","publisher":"Wiley-Blackwell","subject":"Cancer Research","type":"journal-article","sha":"05b2534518cbfb4b2e8a4f5b34231b4d8bae53d5"}
+{"doi":"10.1002/adma.201503200","title":"Hybrid Modulation-Doping of Solution-Processed Ultrathin Layers of ZnO Using Molecular Dopants","authors":["Stefan P. Schießl","Hendrik Faber","Yen-Hung Lin","Stephan Rossbauer","Qingxiao Wang","Kui Zhao","Aram Amassian","Jana Zaumseil","Thomas D. Anthopoulos"],"year":"2016","journal":"Advanced Materials","publisher":"Wiley-Blackwell","subject":"Mechanical Engineering","type":"journal-article","sha":"244884ce4854750d173da815f764fe9ebbfd8cfb"}
+{"doi":"10.1002/anie.200901723","title":"Encapsulation of Sn@carbon Nanoparticles in Bamboo-like Hollow Carbon Nanofibers as an Anode Material in Lithium-Based Batteries","authors":["Yan Yu","Lin Gu","Chunlei Wang","Abirami Dhanabalan","Peter A. van Aken","Joachim Maier"],"year":"2009","journal":"Angewandte Chemie International Edition","publisher":"Wiley-Blackwell","subject":"Chemistry(all)","type":"journal-article","sha":"da060cc84f015eda7240c725e34b3c3bc19220b1"}
+{"doi":"10.1002/anie.201405353","title":"Targeting Human C-Type Lectin-like Molecule-1 (CLL1) with a Bispecific Antibody for Immunotherapy of Acute Myeloid Leukemia","authors":["Hua Lu","Quan Zhou","Vishal Deshmukh","Hardeep Phull","Jennifer Ma","Virginie Tardif","Rahul R. Naik","Claire Bouvard","Yong Zhang","Seihyun Choi","Brian R. Lawson","Shoutian Zhu","Chan Hyuk Kim","Peter G. Schultz"],"year":"2014","journal":"Angewandte Chemie International Edition","publisher":"Wiley-Blackwell","subject":"Chemistry(all)","type":"journal-article","sha":"081c73dc2c39f5ca85584ee74a3904c770f1ab7d"}
+{"doi":"10.1002/app.26705","title":"In vitro monitoring of surface mechanical properties of poly(L-lactic acid) using microhardness","authors":["C. Saiz-Arroyo","Y. Wang","M. A. Rodriguez-Perez","N. M. Alves","J. F. Mano"],"year":"2007","journal":"Journal of Applied Polymer Science","publisher":"Wiley-Blackwell","subject":"Materials Chemistry","type":"journal-article","sha":"5f8df4a3ea809b3a0ba740a6f1af43b34cf1a221"}
+{"doi":"10.1002/art.27486","title":"Endothelial nitric oxide synthase deficiency results in reduced chondrocyte proliferation and endochondral bone growth","authors":["Qian Yan","Qingping Feng","Frank Beier"],"year":"null","journal":"Arthritis & Rheumatism","publisher":"Wiley-Blackwell","subject":"Immunology","type":"journal-article","sha":"161d5d3d9e13662eb81bff67fd9442e2c8e0c999"}
+{"doi":"10.1002/asmb.742","title":"Sequential design in quality control and validation of land cover databases","authors":["Elisabetta Carfagna","Johnny Marzialetti"],"year":"2009","journal":"Applied Stochastic Models in Business and Industry","publisher":"Wiley-Blackwell","subject":"Management Science and Operations Research","type":"journal-article","sha":"1894b4c446e8471ee23fd783be527c6bb387268f"}
+{"doi":"10.1002/etep.278","title":"Application and comparison of wind speed sampling methods for wind generation in reliability studies using non-sequential Monte Carlo simulations","authors":["F. Vallée","J. Lobry","O. Deblecker"],"year":"2009","journal":"European Transactions on Electrical Power","publisher":"Wiley-Blackwell","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"f11f44afe96169cf27cbcda300baaa18a52f47fe"}
+{"doi":"10.1002/hec.4730040503","title":"Confidence intervals for cost/effectiveness ratios","authors":["Peter Wakker","Marc P. Klaassen"],"year":"1995","journal":"Health Economics","publisher":"Wiley-Blackwell","subject":"Health Policy","type":"journal-article","sha":"710246e5b980481a4be8610d2e72c7380bf4f5c7"}
+{"doi":"10.1002/hep.23011","title":"Tripartite motif-containing 22 inhibits the activity of hepatitis B virus core promoter, which is dependent on nuclear-located RING domain","authors":["Bo Gao","Zhijian Duan","Wei Xu","Sidong Xiong"],"year":"2009","journal":"Hepatology","publisher":"Wiley-Blackwell","subject":"Medicine(all)","type":"journal-article","sha":"2ada09b4b4602b697c81180363a90a24fc6b7624"}
+{"doi":"10.1002/jso.23625","title":"Intraoperative electrochemotherapy of colorectal liver metastases","authors":["Ibrahim Edhemovic","Erik Brecelj","Gorana Gasljevic","Maja Marolt Music","Vojka Gorjup","Barbara Mali","Tomaz Jarm","Bor Kos","Denis Pavliha","Biljana Grcar Kuzmanov","Maja Cemazar","Marko Snoj","Damijan Miklavcic","Eldar M. Gadzijev","Gregor Sersa"],"year":"2014","journal":"Journal of Surgical Oncology","publisher":"Wiley-Blackwell","subject":"Surgery","type":"journal-article","sha":"eb12560dd540655789ba3b4c0b5750b77b5916b0"}
+{"doi":"10.1002/pro.2481","title":"Intrinsic α-helical and β-sheet conformational preferences: A computational case study of alanine","authors":["Diego Caballero","Jukka Määttä","Alice Qinhua Zhou","Maria Sammalkorpi","Corey S. O'Hern","Lynne Regan"],"year":"2014","journal":"Protein Science","publisher":"Wiley-Blackwell","subject":"Biochemistry","type":"journal-article","sha":"60568326f4c94acf58d79de0014bbe3334c83f63"}
+{"doi":"10.1002/smj.2056","title":"Market frictions as building blocks of an organizational economics approach to strategic management","authors":["Joseph T. Mahoney","Lihong Qian"],"year":"2013","journal":"Strategic Management Journal","publisher":"Wiley-Blackwell","subject":"Strategy and Management","type":"journal-article","sha":"dd069e431c76a009ebbcdc6a089c629409d3a9e5"}
+{"doi":"10.1006/anbe.1999.1180","title":"The effects of hunger on locomotory behaviour in two species of wolf spider (Araneae, Lycosidae)","authors":["Sean E. Walker","Samuel D. Marshall","Ann L. Rypstra","Douglus H. Taylor"],"year":"1999","journal":"Animal Behaviour","publisher":"Elsevier BV","subject":"Animal Science and Zoology","type":"journal-article","sha":"da11b5f0fb60fa0312aa87ff3039659ac57f921c"}
+{"doi":"10.1006/eujc.1999.0325","title":"On the Maximal Width of Empty Lattice Simplices","authors":["Christian Haase","Günter M. Ziegler"],"year":"2000","journal":"European Journal of Combinatorics","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"715571c447d1461831065e301cdc257d359d73d4"}
+{"doi":"10.1006/ijhc.1999.0336","title":"Editorial: Evaluating knowledge engineering techniques","authors":["TIM MENZIES","FRANK VAN HARMELEN"],"year":"1999","journal":"International Journal of Human-Computer Studies","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"f77a6913c513bd27de190b1a309c40349ce06d13"}
+{"doi":"10.1006/obhd.1999.2843","title":"The Influence of Physical Attractiveness and Gender on Ultimatum Game Decisions","authors":["Sara J. Solnick","Maurice E. Schweitzer"],"year":"1999","journal":"Organizational Behavior and Human Decision Processes","publisher":"Elsevier BV","subject":"Applied Psychology","type":"journal-article","sha":"9211c793469038bc33c6ef5bf424a6317a539805"}
+{"doi":"10.1007/11427834_2","title":"The Rough Set Exploration System","authors":["Jan G. Bazan","Marcin Szczuka"],"year":"2005","journal":"Transactions on Rough Sets III","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"535fafcf004011cb6d88f3ff7714c5b5a81c4319"}
+{"doi":"10.1007/11562948_4","title":"Verifying Very Large Industrial Circuits Using 100 Processes and Beyond","authors":["Limor Fix","Orna Grumberg","Amnon Heyman","Tamir Heyman","Assaf Schuster"],"year":"2005","journal":"Automated Technology for Verification and Analysis","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"377991918248bd9c2026fbaa506447742a89445d"}
+{"doi":"10.1007/11572831_29","title":"QoS Management in Fixed Broadband Residential Gateways","authors":["C. Guerrero","J. Garcia","F. Valera","A. Azcorra"],"year":"2005","journal":"Management of Multimedia Networks and Services","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"8b079f9f5ff40d82e9af308b7884b191541c9c3b"}
+{"doi":"10.1007/11780991_23","title":"How Deep Should It Be? On the Optimality of Hierarchical Architectures","authors":["Amihai Motro","Alessandro D’Atri","Eli Gafni"],"year":"2006","journal":"Next Generation Information Technologies and Systems","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"e0e8a89f29636d7a8b867d9bc2aea430d6362c11"}
+{"doi":"10.1007/3-540-44745-8_41","title":"Gabor Feature Space Diffusion via the Minimal Weighted Area Method","authors":["Chen Sagiv","Nir A. Sochen","Yehoshua Y. Zeevi"],"year":"2001","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"64440112f9ca75867542f1152b465d993e45c5c6"}
+{"doi":"10.1007/3-540-45351-2_36","title":"Diagnosis of Physical Systems with Hybrid Models Using Parametrized Causality","authors":["Pieter J. Mosterman"],"year":"2001","journal":"Hybrid Systems: Computation and Control","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"479f7d842d0b8b575f6617898c2c7fca586756bb"}
+{"doi":"10.1007/3-540-48199-0_14","title":"Transforming Linear Context-Free Rewriting Systems into Minimalist Grammars","authors":["Jens Michaelis"],"year":"2001","journal":"Logical Aspects of Computational Linguistics","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"db93dd5dc26bbe125380f862ad0a268538305c49"}
+{"doi":"10.1007/3-540-49255-0_140","title":"Run-Time Adaptability of Synchronization Policies in Concurrent Object Oriented Languages","authors":["Fernando Sánchez","Juan Hernández","Juan Manuel Murillo","Enrique Pedraza"],"year":"1998","journal":"Object-Oriented Technology: ECOOP’98 Workshop Reader","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"5d8550cc6479d6f5cf1ad37e25cc6e613f957d74"}
+{"doi":"10.1007/978-0-387-98078-2_10","title":"Designing a Regulatory and Supervisory Framework for Integrated Financial Markets","authors":["Giorgio Di Giorgio","Carmine Di Noia"],"year":"2009","journal":"The Changing Geography of Banking and Finance","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"b495000629d9bd73978e7d740eac2450e0b36430"}
+{"doi":"10.1007/978-1-349-18271-8","title":"Computer Communications","authors":["Robert Cole"],"year":"1986","journal":"","publisher":"Springer Nature","subject":"","type":"book","sha":"b79d697849372094f8540e22a096f4764fe6f984"}
+{"doi":"10.1007/978-1-4419-1325-8_4","title":"Processing Constrained k-Closest Pairs Queries in Crime Databases","authors":["Shaojie Qiao","Changjie Tang","Huidong Jin","Shucheng Dai","Xingshu Chen","Michael Chau","Jian Hu"],"year":"2010","journal":"Annals of Information Systems","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"64eb08c61ad0ddaaf756f21bcfebb3c988b7f22b"}
+{"doi":"10.1007/978-1-4842-2671-1_10","title":"Object Oriented Programming","authors":["Thomas Mailund"],"year":"2017","journal":"Beginning Data Science in R","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"d9283460d1b1f10cfb5f9c3a4ffbf4e8d3a70f92"}
+{"doi":"10.1007/978-3-319-01571-2_22","title":"A New Proof System to Verify GDT Agents","authors":["Bruno Mermet","Gaele Simon"],"year":"2014","journal":"Studies in Computational Intelligence","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"0e2f639cfe1e502c076a43cabcb34fa8036a282d"}
+{"doi":"10.1007/978-3-319-08918-8_14","title":"Preciseness of Subtyping on Intersection and Union Types","authors":["Mariangiola Dezani-Ciancaglini","Silvia Ghilezan"],"year":"2014","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"cf1d2c008f6ed3722301d2fe808b22d2aaa4927e"}
+{"doi":"10.1007/978-3-319-13704-9_13","title":"Ontology Design Pattern Property Specialisation Strategies","authors":["Karl Hammar"],"year":"2014","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"620f9fbb15b8292170f5983c3453ea0f28ee0baf"}
+{"doi":"10.1007/978-3-540-24622-0_4","title":"Construction of a Semantic Model for a Typed Assembly Language","authors":["Gang Tan","Andrew W. Appel","Kedar N. Swadi","Dinghao Wu"],"year":"2004","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"b287f3d8b7bb272962d6771c227fc690cf28402a"}
+{"doi":"10.1007/978-3-540-24673-2_40","title":"Galilean Differential Geometry of Moving Images","authors":["Daniel Fagerström"],"year":"2004","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"16fa31f3ff56bd670dae3e046f5f60ddc47b5cac"}
+{"doi":"10.1007/978-3-540-24855-2_125","title":"Designing Multiplicative General Parameter Filters Using Adaptive Genetic Algorithms","authors":["Jarno Martikainen","Seppo J. Ovaska"],"year":"2004","journal":"Genetic and Evolutionary Computation – GECCO 2004","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"56ad4991aad40daa5c6da61d9788337b6c844ba9"}
+{"doi":"10.1007/978-3-540-30222-3_29","title":"Comparing Weighting Models for Monolingual Information Retrieval","authors":["Gianni Amati","Claudio Carpineto","Giovanni Romano"],"year":"2004","journal":"Comparative Evaluation of Multilingual Information Access Systems","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"9ed309f4aec835264863d1ce9746fc7edf2c14d1"}
+{"doi":"10.1007/978-3-540-32439-3_12","title":"Relaxing Planarity for Topological Graphs","authors":["János Pach","RadoÅ¡ RadoiÄić","Géza Tóth"],"year":"2006","journal":"Bolyai Society Mathematical Studies","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"0b1c525f6caaffff9e8010ec4ba276e460a017e7"}
+{"doi":"10.1007/978-3-540-39901-8_1","title":"Global Optimization and Constraint Satisfaction: The Branch-and-Reduce Approach","authors":["Nikolaos V. Sahinidis"],"year":"2003","journal":"Global Optimization and Constraint Satisfaction","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"dd5ba4cb46a5457c8065afb9fa096bde00406bb3"}
+{"doi":"10.1007/978-3-540-45198-3_18","title":"Computational Analogues of Entropy","authors":["Boaz Barak","Ronen Shaltiel","Avi Wigderson"],"year":"2003","journal":"Approximation, Randomization, and Combinatorial Optimization.. Algorithms and Techniques","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"4a60bf62c998a6d0bf85dea13146162e7cad8e42"}
+{"doi":"10.1007/978-3-540-70600-7_35","title":"Suffix Tree Characterization of Maximal Motifs in Biological Sequences","authors":["Maria Federico","Nadia Pisanti"],"year":"null","journal":"Communications in Computer and Information Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"7efc69adbe1431ffae9b0d60bce8152e8118cf3f"}
+{"doi":"10.1007/978-3-540-72079-9_13","title":"Adaptive Content Presentation for the Web","authors":["Andrea Bunt","Giuseppe Carenini","Cristina Conati"],"year":"null","journal":"The Adaptive Web","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"402a43745567aa0fe0dcd6bf3aa1eb0b21335098"}
+{"doi":"10.1007/978-3-540-72521-3_20","title":"Optimal Bitwise Register Allocation Using Integer Linear Programming","authors":["Rajkishore Barik","Christian Grothoff","Rahul Gupta","Vinayaka Pandit","Raghavendra Udupa"],"year":"null","journal":"Languages and Compilers for Parallel Computing","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"4174ccdcc1b314ce1350b84cb37ed0e20ca48dee"}
+{"doi":"10.1007/978-3-642-01112-2_3","title":"Robust Web Services Provisioning through On-Demand Replication","authors":["Quan Z. Sheng","Zakaria Maamar","Jian Yu","Anne H. H. Ngu"],"year":"2009","journal":"Lecture Notes in Business Information Processing","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"2a9173cfcd7338e1b965db40e38f0851fe6030d4"}
+{"doi":"10.1007/978-3-642-03641-5_29","title":"Reconstructing Optical Flow Fields by Motion Inpainting","authors":["Benjamin Berkels","Claudia Kondermann","Christoph Garbe","Martin Rumpf"],"year":"2009","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"a621af9501677022a01965396fc0b21f63c93f7a"}
+{"doi":"10.1007/978-3-642-12297-2_48","title":"Interactive Super-Resolution through Neighbor Embedding","authors":["Jian Pu","Junping Zhang","Peihong Guo","Xiaoru Yuan"],"year":"2010","journal":"Computer Vision – ACCV 2009","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"a6d5812b8e9603a524a7af90312bc633cbf47a41"}
+{"doi":"10.1007/978-3-642-16327-2_34","title":"An English-Arabic Bi-directional Machine Translation Tool in the Agriculture Domain","authors":["Khaled Shaalan","Ashraf Hendam","Ahmed Rafea"],"year":"2010","journal":"Intelligent Information Processing V","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"f5b17dd68c18848901c6a7099a099b68fa6f3014"}
+{"doi":"10.1007/978-3-642-24279-3_11","title":"An Experiment in Hierarchical Recognition of Group Activities Using Wearable Sensors","authors":["Dawud Gordon","Jan-Hendrik Hanne","Martin Berchtold","Takashi Miyaki","Michael Beigl"],"year":"2011","journal":"Modeling and Using Context","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"e0b997484c18db4398ead336087850ab10888a2e"}
+{"doi":"10.1007/978-3-642-25093-4_23","title":"DC Proposal: Model for News Filtering with Named Entities","authors":["Ivo Lašek"],"year":"2011","journal":"The Semantic Web – ISWC 2011","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"5b05ed96f8e433fa55d3d756145312358a9e6a67"}
+{"doi":"10.1007/978-3-642-30382-1_23","title":"Competition among Telecommunication Providers","authors":["Patrick Maillé","Peter Reichl","Bruno Tuffin"],"year":"2012","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"5cc25532b6df7989f3c1709edc8fcaff8aa8da59"}
+{"doi":"10.1007/978-3-642-32714-8_3","title":"Robust Phase-Correlation Based Registration of Airborne Videos Using Motion Estimation","authors":["Frank de Morsier","Maurice Borgeaud","Christoph Küchler","Adrian Vogel","Volker Gass","Jean-Philippe Thiran"],"year":"2013","journal":"Lecture Notes in Geoinformation and Cartography","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"9c25585364c4d04fb57c92d112ce3b0818c328b4"}
+{"doi":"10.1007/978-3-642-37658-0_12","title":"Task Parallelism and Data Distribution: An Overview of Explicit Parallel Programming Languages","authors":["Dounia Khaldi","Pierre Jouvelot","Corinne Ancourt","François Irigoin"],"year":"2013","journal":"Languages and Compilers for Parallel Computing","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"82eb4bc90c4d9a9b2f924996e3583942b4cad391"}
+{"doi":"10.1007/978-3-642-40543-3_40","title":"A Consolidated DaaS Model for Situation-Informed Incident Management","authors":["Nan Jiang","Lai Xu","Paul De Vrieze"],"year":"2013","journal":"IFIP Advances in Information and Communication Technology","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"2f20f3d0db467fc77eb19689c91091c89268089d"}
+{"doi":"10.1007/978-3-642-60597-0_4","title":"Large-Scale Simulations of Melting in Two-Dimensional Lennard-Jones Systems: Evidence for a Metastable Hexatic Phase","authors":["K. Chen","T. Kaplan","M. Mostoller"],"year":"1997","journal":"Springer Proceedings in Physics","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"987cf8f9536b71ac51abe4c0cbdecad5fab2fd2a"}
+{"doi":"10.1007/978-3-662-47157-9_8","title":"Introducing a Socio-Technical Perspective on Business Processes into Enterprise Interoperability Frameworks","authors":["Charles Crick","Eng K. Chew"],"year":"2015","journal":"Lecture Notes in Business Information Processing","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"9f84b71a8bcacf2d57e61f78688c34c8137028bd"}
+{"doi":"10.1007/978-94-015-9486-8_2","title":"The Comitology Game: European Policymaking with Parliamentary Involvement","authors":["Bernard Steunenberg","Dieter Schmidtchen"],"year":"2000","journal":"Library of Public Policy and Public Administration","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"20c9deda71acca3f754dc5ae01780c3115c41802"}
+{"doi":"10.1007/bf00399368","title":"Formation mechanisms of illite, chlorite and mixed-layer illite-chlorite in Triassic volcanogenic sediments from the Southland Syncline, New Zealand","authors":["Jung Ho Ahn","Donald R. Peacor","Douglas S. Coombs"],"year":"1988","journal":"Contributions to Mineralogy and Petrology","publisher":"Springer Nature","subject":"Geochemistry and Petrology","type":"journal-article","sha":"e1546a48044eef921731ebdedaf5cd7b9650abf0"}
+{"doi":"10.1007/bf01002389","title":"Applications of staffing, scheduling, and budgeting methodologies to hospital ancillary units","authors":["Myeng-Ki Kim","Walton M. Hancock"],"year":"1989","journal":"Journal of Medical Systems","publisher":"Springer Nature","subject":"Health Informatics","type":"journal-article","sha":"ada7455f31f400af53a2ff3e7b43a34f118b6030"}
+{"doi":"10.1007/bf01151577","title":"Comparison of thallium-201 SPECT redistribution patterns and rubidium-82 PET rest-stress myocardial blood flow imaging","authors":["Richard E. Stewart","Jeffrey Popma","Gerald M. Gacioch","Morton Kalus","Sheila Squicciarini","Ziad Al-Aouar","M. Anthony Schork","Markus Schwaiger"],"year":"1994","journal":"The International Journal of Cardiac Imaging","publisher":"Springer Nature","subject":"Radiological and Ultrasound Technology","type":"journal-article","sha":"c9236bf7c663698bdc4484acfbe1e623cc2a1fdd"}
+{"doi":"10.1007/bf01385523","title":"Symplectic phase flow approximation for the numerical integration of canonical systems","authors":["S. Miesbach","H. J. Pesch"],"year":"1992","journal":"Numerische Mathematik","publisher":"Springer Nature","subject":"Applied Mathematics","type":"journal-article","sha":"a524d479ef5f45bba366c6e56bd8233ab81cc6f1"}
+{"doi":"10.1007/bf01588971","title":"An analysis of approximations for maximizing submodular set functions—I","authors":["G. L. Nemhauser","L. A. Wolsey","M. L. Fisher"],"year":"1978","journal":"Mathematical Programming","publisher":"Springer Nature","subject":"Software","type":"journal-article","sha":"b9e43395663f74c581982e9ca97a0d7057a0008c"}
+{"doi":"10.1007/bf01630895","title":"The medial and inferior calcaneal nerves: an anatomic study","authors":["S. Louisia","A. C. Masquelet"],"year":"1999","journal":"Surgical and Radiologic Anatomy","publisher":"Springer Nature","subject":"Anatomy","type":"journal-article","sha":"ba6cc85fe7868538125a8034f617eb3136505648"}
+{"doi":"10.1007/bf01832010","title":"Barocke Thematik in der Lyrik des Andreas Gryphius","authors":["Edelgard E. Conradt"],"year":"1956","journal":"Neophilologus","publisher":"Springer Nature","subject":"Linguistics and Language","type":"journal-article","sha":"debe170a4804971677c3c0cf5e6009eb59cbafdb"}
+{"doi":"10.1007/bf02533128","title":"A calorimeter coupled with a magnetic spectrometer for the detection of primary cosmic antiprotons","authors":["G. Basini","A. Morselli","M. Occhigrossi","M. Ricci","P. Spillantini","F. Bongiorno","P. Picozza","A. Codino","M. Menichelli","S. Bartalucci"],"year":"1988","journal":"Il Nuovo Cimento C","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"7e66c70d2d2a38a3cdaf330848960e08abbee3f5"}
+{"doi":"10.1007/bf02948747","title":"Genetic Programming with simple loops","authors":["Yuesheng Qi","Baozhong Wang","Lishan Kang"],"year":"1999","journal":"Journal of Computer Science and Technology","publisher":"Springer Nature","subject":"Theoretical Computer Science","type":"journal-article","sha":"4b828892a3960a2820aed706f502547d08bbef55"}
+{"doi":"10.1007/bf02987464","title":"Environmental chemistry education in Europe: Setting the agenda","authors":["Uri Zoller"],"year":"2002","journal":"Environmental Science and Pollution Research","publisher":"Springer Nature","subject":"Pollution","type":"journal-article","sha":"9e61373b9a790999d1128c71e0c2350c27a97f1f"}
+{"doi":"10.1007/bfb0054145","title":"A practical mix","authors":["Markus Jakobsson"],"year":"1998","journal":"Lecture Notes in Computer Science","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"b71edd44be9d2fdee2e4b69955409ee94fa2ab27"}
+{"doi":"10.1007/s00010-010-0056-8","title":"Christensen measurability and some functional equation","authors":["Eliza Jabłońska"],"year":"2011","journal":"Aequationes mathematicae","publisher":"Springer Nature","subject":"Applied Mathematics","type":"journal-article","sha":"8f0a1f76023ceafbf0fa37be1a03862677db26ec"}
+{"doi":"10.1007/s00158-014-1097-8","title":"Optimal design of commercial vehicle systems using analytical target cascading","authors":["Namwoo Kang","Michael Kokkolaras","Panos Y. Papalambros","Seungwon Yoo","Wookjin Na","Jongchan Park","Dieter Featherman"],"year":"2014","journal":"Structural and Multidisciplinary Optimization","publisher":"Springer Nature","subject":"Control and Systems Engineering","type":"journal-article","sha":"e98fc3c58b67c3709b442d9c77dbb2305ad32106"}
+{"doi":"10.1007/s00192-008-0734-4","title":"Is there any evidence to advocate SUI prevention in continent women undergoing prolapse repair? An overview","authors":["B. Fatton"],"year":"2009","journal":"International Urogynecology Journal","publisher":"Springer Nature","subject":"Obstetrics and Gynaecology","type":"journal-article","sha":"5052680d334decd0e0dce66fe18b74297495a50a"}
+{"doi":"10.1007/s00199-009-0471-z","title":"Holdup, search, and inefficiency","authors":["Shingo Ishiguro"],"year":"2010","journal":"Economic Theory","publisher":"Springer Nature","subject":"Economics and Econometrics","type":"journal-article","sha":"a3d2c453f39527891470ae3b9caf246a90b90e62"}
+{"doi":"10.1007/s002530050889","title":"Corrosion inhibition by aerobic biofilms on SAE 1018 steel","authors":["A. Jayaraman","J. C. Earthman","T. K. Wood"],"year":"1997","journal":"Applied Microbiology and Biotechnology","publisher":"Springer Nature","subject":"Biotechnology","type":"journal-article","sha":"f7cd8b336a0d135cc8ec5b577e7aec032b63a1b3"}
+{"doi":"10.1007/s00338-013-1045-x","title":"Initial spread of the invasive green alga Caulerpa verticillata over coral reef communities in the Gulf of California","authors":["C. J. Pérez-Estrada","R. Rodríguez-Estrella","D. S. Palacios-Salgado","D. A. Paz-García"],"year":"2013","journal":"Coral Reefs","publisher":"Springer Nature","subject":"Aquatic Science","type":"journal-article","sha":"f7601e6e94fc4f863fbb61f71877932d12556c0d"}
+{"doi":"10.1007/s00339-010-6169-z","title":"Liquid-immersion laser micromachining of GaN grown on sapphire","authors":["Giuseppe Y. Mak","Edmund Y. Lam","H. W. Choi"],"year":"2011","journal":"Applied Physics A","publisher":"Springer Nature","subject":"Materials Science(all)","type":"journal-article","sha":"758a35b7eebf0404abd380cec1c9c6de8cc2bb4f"}
+{"doi":"10.1007/s00348-007-0410-3","title":"Experimental observation using particle image velocimetry of inertial waves in a rotating fluid","authors":["Laura Messio","Cyprien Morize","Marc Rabaud","Frédéric Moisy"],"year":"2008","journal":"Experiments in Fluids","publisher":"Springer Nature","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"c75901f73105b3f724350f9f555626242f893203"}
+{"doi":"10.1007/s00355-009-0414-4","title":"Hugh LaFollette: The Practice of Ethics","authors":["Alex Voorhoeve"],"year":"2010","journal":"Social Choice and Welfare","publisher":"Springer Nature","subject":"Economics and Econometrics","type":"journal-article","sha":"a65aa4239ed749f4fca3f629c2c001e792a66514"}
+{"doi":"10.1007/s00383-005-1500-z","title":"Advances in short bowel syndrome: an updated review","authors":["Igor Sukhotnik","Arnold G. Coran","Alexander Kramer","Eitan Shiloni","Jorge G. Mogilner"],"year":"2005","journal":"Pediatric Surgery International","publisher":"Springer Nature","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"6d89b088163d6d670a509927b5796af15d59f573"}
+{"doi":"10.1007/s00401-015-1399-y","title":"Liquid biopsies in patients with diffuse glioma","authors":["Myron G. Best","Nik Sol","Sebastiaan Zijl","Jaap C. Reijneveld","Pieter Wesseling","Thomas Wurdinger"],"year":"2015","journal":"Acta Neuropathologica","publisher":"Springer Nature","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"a89b878ffdd46b8fff31f379f6ceab6f8164dc36"}
+{"doi":"10.1007/s00407-010-0075-9","title":"Testing universal gravitation in the laboratory, or the significance of research on the mean density of the earth and big G, 1798–1898: changing pursuits and long-term methodological–experimental continuity","authors":["Steffen Ducheyne"],"year":"2011","journal":"Archive for History of Exact Sciences","publisher":"Springer Nature","subject":"History and Philosophy of Science","type":"journal-article","sha":"d990ae8b0b55d8b39e807b9cb8108e891ee2e135"}
+{"doi":"10.1007/s00422-009-0327-4","title":"What can be learnt from analysing insect orientation flights using probabilistic SLAM?","authors":["Bartholomew Baddeley","Andrew Philippides","Paul Graham","Natalie Hempel de Ibarra","Thomas Collett","Phillip Husbands"],"year":"2009","journal":"Biological Cybernetics","publisher":"Springer Nature","subject":"Biotechnology","type":"journal-article","sha":"ee8486e840f16483738ec1910939fa62ec69e051"}
+{"doi":"10.1007/s00441-013-1735-2","title":"Possible role of gap junction intercellular channels and connexin 43 in satellite glial cells (SGCs) for preservation of human spiral ganglion neurons","authors":["Wei Liu","Rudolf Glueckert","Fred H. Linthicum","Gunde Rieger","Michael Blumer","Mario Bitsche","Elisabeth Pechriggl","Helge Rask-Andersen","Annelies Schrott-Fischer"],"year":"2014","journal":"Cell and Tissue Research","publisher":"Springer Nature","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"80ec0dee732ee5c22c3e06a7a0a0f9ca8ab8b2b0"}
+{"doi":"10.1007/s00542-009-1010-3","title":"Fast replication of out-of-plane microlens with polydimethylsiloxane and curable polymer (NOA73)","authors":["Guocheng Shao","Weiping Qiu","Wanjun Wang"],"year":"2010","journal":"Microsystem Technologies","publisher":"Springer Nature","subject":"Hardware and Architecture","type":"journal-article","sha":"46ed1a2249005c2ff0992ccd04857227f83cce76"}
+{"doi":"10.1007/s00705-005-0712-9","title":"Yellow leaf of sugarcane is caused by at least three different genotypes of sugarcane yellow leaf virus, one of which predominates on the Island of Réunion","authors":["Y. Abu Ahmad","L. Rassaby","M. Royer","Z. Borg","K. S. Braithwaite","T. E. Mirkov","M. S. Irey","X. Perrier","G. R. Smith","P. Rott"],"year":"2006","journal":"Archives of Virology","publisher":"Springer Nature","subject":"Virology","type":"journal-article","sha":"ed696f4a0899e67d343db0626fb53a3dbab29336"}
+{"doi":"10.1007/s00726-014-1873-1","title":"Inhibitors of amino acids biosynthesis as antifungal agents","authors":["Kamila Jastrzębowska","Iwona Gabriel"],"year":"2015","journal":"Amino Acids","publisher":"Springer Nature","subject":"Organic Chemistry","type":"journal-article","sha":"577c2feee29191dfc31f143617b4d52e6343f076"}
+{"doi":"10.1007/s10162-006-0058-3","title":"Differential Distribution of Stem Cells in the Auditory and Vestibular Organs of the Inner Ear","authors":["Kazuo Oshima","Christian M. Grimm","C. Eduardo Corrales","Pascal Senn","Rodrigo Martinez Monedero","Gwenaëlle S. G. Géléoc","Albert Edge","Jeffrey R. Holt","Stefan Heller"],"year":"2007","journal":"Journal of the Association for Research in Otolaryngology","publisher":"Springer Nature","subject":"Sensory Systems","type":"journal-article","sha":"c9b376a4928ae84dad0cf2eacb2bc3d63c14b8d0"}
+{"doi":"10.1007/s10182-014-0225-5","title":"Testing monotonicity of pricing kernels","authors":["Yuri Golubev","Wolfgang K. Härdle","Roman Timofeev"],"year":"2014","journal":"AStA Advances in Statistical Analysis","publisher":"Springer Nature","subject":"Modelling and Simulation","type":"journal-article","sha":"c21fdc00e8ef984da700c9c8b41be80d5214271f"}
+{"doi":"10.1007/s101820400158","title":"Kurtosis modelling by means of the J-transformation","authors":["Matthias Fischer","Ingo Klein"],"year":"2004","journal":"Allgemeines Statistisches Archiv","publisher":"Springer Nature","subject":"Statistics and Probability","type":"journal-article","sha":"84b1f1663e1f155d5c7c939d2a147e73038abfba"}
+{"doi":"10.1007/s10267-010-0094-z","title":"A survey of proteases in edible mushrooms with synthetic peptides as substrates","authors":["Mayumi Nakamura","Aya Iketani","Yuzo Shioi"],"year":"2011","journal":"Mycoscience","publisher":"Elsevier BV","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"842e64035bd43e5d9de23556e165287e6dfc9fe5"}
+{"doi":"10.1007/s10273-005-0451-2","title":"Die ökonomischen Konsequenzen der Korruption","authors":[" "],"year":"2005","journal":"Wirtschaftsdienst","publisher":"Springer Nature","subject":"Business, Management and Accounting (miscellaneous)","type":"journal-article","sha":"84b1c80eb3aae857900e6ce5c5acaeff228b1d11"}
+{"doi":"10.1007/s10489-006-0018-y","title":"Genetic operators for combinatorial optimization in TSP and microarray gene ordering","authors":["Shubhra Sankar Ray","Sanghamitra Bandyopadhyay","Sankar K. Pal"],"year":"2007","journal":"Applied Intelligence","publisher":"Springer Nature","subject":"Artificial Intelligence","type":"journal-article","sha":"d0a6d13460ebdd8259590f1c014ff51353802af3"}
+{"doi":"10.1007/s10508-011-9803-8","title":"Body Odor Quality Predicts Behavioral Attractiveness in Humans","authors":["S. Craig Roberts","Alexandra Kralevich","Camille Ferdenzi","Tamsin K. Saxton","Benedict C. Jones","Lisa M. DeBruine","Anthony C. Little","Jan Havlicek"],"year":"2011","journal":"Archives of Sexual Behavior","publisher":"Springer Nature","subject":"Psychology(all)","type":"journal-article","sha":"b509d2cd399e39bef36349dd6437b54ac0c9d623"}
+{"doi":"10.1007/s10508-012-9956-0","title":"Masturbation is Related to Psychopathology and Prostate Dysfunction: Comment on Quinsey (2012)","authors":["Rui Miguel Costa"],"year":"2012","journal":"Archives of Sexual Behavior","publisher":"Springer Nature","subject":"Psychology(all)","type":"journal-article","sha":"e634311c9170674fd734aaf8779153205ab5238f"}
+{"doi":"10.1007/s10546-012-9701-1","title":"The Effect of Scale on the Applicability of Taylor’s Frozen Turbulence Hypothesis in the Atmospheric Boundary Layer","authors":["Chad W. Higgins","Martin Froidevaux","Valentin Simeonov","Nikki Vercauteren","Caitlin Barry","Marc B. Parlange"],"year":"2012","journal":"Boundary-Layer Meteorology","publisher":"Springer Nature","subject":"Atmospheric Science","type":"journal-article","sha":"0298cfd505d89679b4acde582656896634e5e6e0"}
+{"doi":"10.1007/s10663-008-9084-1","title":"Does Benford’s Law hold in economic research and forecasting?","authors":["Stefan Günnel","Karl-Heinz Tödter"],"year":"2009","journal":"Empirica","publisher":"Springer Nature","subject":"Geography, Planning and Development","type":"journal-article","sha":"a3d468938790944bd9b4a8cfbe102ce4b972cfa7"}
+{"doi":"10.1007/s10725-011-9599-5","title":"Effect of traumatic acid on antioxidant activity in Chlorella vulgaris (Chlorophyceae)","authors":["Anna Pietryczuk","Romuald Czerpak"],"year":"2011","journal":"Plant Growth Regulation","publisher":"Springer Nature","subject":"Agronomy and Crop Science","type":"journal-article","sha":"e546d1c06c234baacc1fb0b1ff3b0659805e1c84"}
+{"doi":"10.1007/s10803-005-5037-8","title":"Timing of Prenatal Stressors and Autism","authors":["D. Q. Beversdorf","S. E. Manning","A. Hillier","S. L. Anderson","R. E. Nordgren","S. E. Walters","H. N. Nagaraja","W. C. Cooley","S. E. Gaelic","M. L. Bauman"],"year":"2005","journal":"Journal of Autism and Developmental Disorders","publisher":"Springer Nature","subject":"Developmental and Educational Psychology","type":"journal-article","sha":"ac59503663e07f6a1747647d7dd78d56d40c4e9c"}
+{"doi":"10.1007/s10955-006-9062-6","title":"The Boltzmann Equation for Driven Systems of Inelastic Soft Spheres","authors":["M. H. Ernst","E. Trizac","A. Barrat"],"year":"2006","journal":"Journal of Statistical Physics","publisher":"Springer Nature","subject":"Mathematical Physics","type":"journal-article","sha":"5a0dc993f52025b61ee6ac86a94b9d523a83a94c"}
+{"doi":"10.1007/s11065-009-9104-3","title":"Special Section of Neuropsychology Review on HIV/NeuroAIDS","authors":["Edith V. Sullivan"],"year":"2009","journal":"Neuropsychology Review","publisher":"Springer Nature","subject":"Neuropsychology and Physiological Psychology","type":"journal-article","sha":"a00de21eb383e61bfa93613f6e88ff90704c80e0"}
+{"doi":"10.1007/s11127-012-9954-8","title":"The devil is in the shadow. Do institutions affect income and productivity or only official income and official productivity?","authors":["Axel Dreher","Pierre-Guillaume Méon","Friedrich Schneider"],"year":"2014","journal":"Public Choice","publisher":"Springer Nature","subject":"Economics and Econometrics","type":"journal-article","sha":"a03c7548abd22d4ba93260e4c2bfe297fa0488fe"}
+{"doi":"10.1007/s11219-013-9205-3","title":"Prediction of faults-slip-through in large software projects: an empirical evaluation","authors":["Wasif Afzal","Richard Torkar","Robert Feldt","Tony Gorschek"],"year":"2014","journal":"Software Quality Journal","publisher":"Springer Nature","subject":"Media Technology","type":"journal-article","sha":"0e66f3fe27bde5f22e73eb7c58b71796ad57069a"}
+{"doi":"10.1007/s11222-009-9162-7","title":"Estimation and regularization techniques for regression models with multidimensional prediction functions","authors":["Matthias Schmid","Sergej Potapov","Annette Pfahlberg","Torsten Hothorn"],"year":"2010","journal":"Statistics and Computing","publisher":"Springer Nature","subject":"Theoretical Computer Science","type":"journal-article","sha":"92c2868b73f2d1f4701296c07b5c823eed437acf"}
+{"doi":"10.1007/s11263-013-0673-5","title":"Mixture of Trees Probabilistic Graphical Model for Video Segmentation","authors":["Vijay Badrinarayanan","Ignas Budvytis","Roberto Cipolla"],"year":"2014","journal":"International Journal of Computer Vision","publisher":"Springer Nature","subject":"Software","type":"journal-article","sha":"432186760fe45c7f535a2c00bbb2b9ad77f136db"}
+{"doi":"10.1007/s11606-010-1514-7","title":"Racial/Ethnic Disparities in Exercise and Dietary Behaviors of Middle-Aged and Older Adults","authors":["Kristin J. August","Dara H. Sorkin"],"year":"2011","journal":"Journal of General Internal Medicine","publisher":"Springer Nature","subject":"Internal Medicine","type":"journal-article","sha":"828c11bda4fb8ac131100febcb3e31915cadeec2"}
+{"doi":"10.1007/s11786-008-0052-8","title":"Automatic Proof of Graph Nonisomorphism","authors":["Arjeh M. Cohen","Jan Willem Knopper","Scott H. Murray"],"year":"2008","journal":"Mathematics in Computer Science","publisher":"Springer Nature","subject":"Computational Theory and Mathematics","type":"journal-article","sha":"0ce718957cbc4cd5b59a8e9c53104d860775f9db"}
+{"doi":"10.1007/s11805-007-0147-z","title":"A case of epithelioid hemangioendothelioma in the liver","authors":["Chunrong Ye","Wen Wang"],"year":"2007","journal":"Chinese Journal of Clinical Oncology","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"5c9b697566f3ef07cb11e62854f29df91b3eeb63"}
+{"doi":"10.1007/s12013-012-9430-6","title":"Z-disc Transcriptional Coupling, Sarcomeroptosis and Mechanopoptosis","authors":["Ralph Knöll","Byambajav Buyandelger"],"year":"2013","journal":"Cell Biochemistry and Biophysics","publisher":"Springer Nature","subject":"Biophysics","type":"journal-article","sha":"06ae5d02cf7e757c964e12956307dc69d765e5a8"}
+{"doi":"10.1007/s12024-011-9269-y","title":"Animals, autopsies and artefacts","authors":["Roger W. Byard"],"year":"2011","journal":"Forensic Science, Medicine, and Pathology","publisher":"Springer Nature","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"e21d7523fbec45d0e58437c9118910cb300a48a4"}
+{"doi":"10.1007/s12035-010-8137-1","title":"Why Pleiotropic Interventions are Needed for Alzheimer's Disease","authors":["Sally A. Frautschy","Greg M. Cole"],"year":"2010","journal":"Molecular Neurobiology","publisher":"Springer Nature","subject":"Cellular and Molecular Neuroscience","type":"journal-article","sha":"288182c252807745fd6bcca68972172e80cf1662"}
+{"doi":"10.1007/s12064-013-0194-3","title":"Metabolic scaling theory in plant biology and the three oxygen paradoxa of aerobic life","authors":["Ulrich Kutschera","Karl J. Niklas"],"year":"2013","journal":"Theory in Biosciences","publisher":"Springer Nature","subject":"Statistics and Probability","type":"journal-article","sha":"eb4346d501a25bbeda2ed17a96fd21b47dc253b0"}
+{"doi":"10.1007/s12185-010-0700-z","title":"AA amyloidosis associated with macroglobulinemia","authors":["Raine Tatara","Tadashi Nagai","Hiroyuki Kobayashi","Kaoru Hatano","Takahiro Suzuki","Kazuo Muroi","Keiya Ozawa"],"year":"2010","journal":"International Journal of Hematology","publisher":"Springer Nature","subject":"Hematology","type":"journal-article","sha":"e6320352719e48949dd95748fce231f56bab0203"}
+{"doi":"10.1007/s12194-008-0031-4","title":"Automated segmentation of hepatic vessels in non-contrast X-ray CT images","authors":["Suguru Kawajiri","Xiangrong Zhou","Xuejun Zhang","Takeshi Hara","Hiroshi Fujita","Ryujiro Yokoyama","Hiroshi Kondo","Masayuki Kanematsu","Hiroaki Hoshi"],"year":"2008","journal":"Radiological Physics and Technology","publisher":"Springer Nature","subject":"Physical Therapy, Sports Therapy and Rehabilitation","type":"journal-article","sha":"30e5fb58c61780dff1b8ec0e5771c65afc4f824d"}
+{"doi":"10.1007/s12559-012-9169-9","title":"Biometric Applications Related to Human Beings: There Is Life beyond Security","authors":["Marcos Faundez-Zanuy","Amir Hussain","Jiri Mekyska","Enric Sesa-Nogueras","Enric Monte-Moreno","Anna Esposito","Mohamed Chetouani","Josep Garre-Olmo","Andrew Abel","Zdenek Smekal","Karmele Lopez-de-Ipiña"],"year":"2013","journal":"Cognitive Computation","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"41fd1cd5e78ae710b67fd17ebe52b8e182f4b02e"}
+{"doi":"10.1007/s12630-010-9422-1","title":"Palonosetron-induced migraine-type headache","authors":["Amit Jain"],"year":"2011","journal":"Canadian Journal of Anesthesia/Journal canadien d'anesthésie","publisher":"Springer Nature","subject":"Anesthesiology and Pain Medicine","type":"journal-article","sha":"375bf4bb5419bd7cf05b37d5049257e780682c91"}
+{"doi":"10.1007/s12651-009-0007-9","title":"Vouchers in U.S. vocational training programs: an overview of what we have learned","authors":["Burt S. Barnow"],"year":"2009","journal":"Zeitschrift für ArbeitsmarktForschung","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"7a61a0280e257a9b0d5c501fb1283d4a93605598"}
+{"doi":"10.1007/s12664-010-0079-z","title":"Gastrointestinal stromal tumors: a clinicopathological and immunohistochemical study of 121 cases","authors":["Mukul Vij","Vinita Agrawal","Ashok Kumar","Rakesh Pandey"],"year":"2010","journal":"Indian Journal of Gastroenterology","publisher":"Springer Nature","subject":"Gastroenterology","type":"journal-article","sha":"900c40d08c7dabc92a530be45f730234f662ab8d"}
+{"doi":"10.1007/s40070-013-0009-2","title":"Preface to the EURO Journal on Decision Processes","authors":["Ahti Salo","Marja Makarow"],"year":"2013","journal":"EURO Journal on Decision Processes","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"b32a8e20bdaba0675c678fdb5be3ca8a28d5c4da"}
+{"doi":"10.1016/0002-8703(75)90468-8","title":"Right bundle branch block during transvenous ventricular pacing","authors":["William S. Abernathy","Barry J. Crevey"],"year":"1975","journal":"American Heart Journal","publisher":"Elsevier BV","subject":"Cardiology and Cardiovascular Medicine","type":"journal-article","sha":"166541f1ad1a8fc06290625e7f1c1fb68a86a6d4"}
+{"doi":"10.1016/0014-4827(77)90172-0","title":"The interrelationship of cell growth and division in haploid and diploid cells of Saccharomyces cerevisiae","authors":["Julian Adams"],"year":"1977","journal":"Experimental Cell Research","publisher":"Elsevier BV","subject":"Cell Biology","type":"journal-article","sha":"871e77317d6b18c2ab1945203be0ca35a4b9d9a3"}
+{"doi":"10.1016/0014-4886(68)90080-0","title":"A functional analysis of sensory units innervating epiglottis and larynx","authors":["Arthur T. Storey"],"year":"1968","journal":"Experimental Neurology","publisher":"Elsevier BV","subject":"Developmental Neuroscience","type":"journal-article","sha":"2ff3fdda6892f35b447e6137e93f3029299518e0"}
+{"doi":"10.1016/0019-1035(91)90020-t","title":"Possible microwave absorption by H2S gas in Uranus' and Neptune's atmospheres","authors":["Imke de Pater","Paul N. Romani","Sushil K. Atreya"],"year":"1991","journal":"Icarus","publisher":"Elsevier BV","subject":"Space and Planetary Science","type":"journal-article","sha":"53ae41789a3664b57548561f8bcacf3a50f6cd95"}
+{"doi":"10.1016/0031-9163(65)90846-2","title":"Fourier-transform spectroscopy using holographic imaging without computing and with stationary interferometers","authors":["G.W. Stroke","A.T. Funkhouser"],"year":"1965","journal":"Physics Letters","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"52af2ac5d98e47e8707db0a456aaa259ceee1d66"}
+{"doi":"10.1016/0038-092x(87)90036-3","title":"Concentration profile in the gradient zone of small solar ponds","authors":["J. Srinivasan","Abhijit Guha"],"year":"1987","journal":"Solar Energy","publisher":"Elsevier BV","subject":"Renewable Energy, Sustainability and the Environment","type":"journal-article","sha":"cd4103ed2ca6b97d13595264b517dbb8be09c23f"}
+{"doi":"10.1016/0039-6028(71)90102-6","title":"Spectrophotometric determination of the optical properties of an adsorbed oxygen layer on gold","authors":["D.M. Kolb","J.D.E. McIntyre"],"year":"1971","journal":"Surface Science","publisher":"Elsevier BV","subject":"Materials Chemistry","type":"journal-article","sha":"949b283c8a0291c0742f035d9aab977626198068"}
+{"doi":"10.1016/0049-3848(89)90134-5","title":"Clotting activation and impairment of fibrinolysis in malignancy","authors":["E. Rocha","J.A. Páramo","F.J. Fernández","B. Cuesta","M. Hernández","M.J. Paloma","J. Rifón"],"year":"1989","journal":"Thrombosis Research","publisher":"Elsevier BV","subject":"Hematology","type":"journal-article","sha":"9c6af76859e0b1866f8fd6f68da1db6ca4ac78ef"}
+{"doi":"10.1016/0148-9062(95)00043-7","title":"Finite element analysis of the modified ring test for determining mode I fracture toughness","authors":["M.P. Fischer","D. Elsworth","R.B. Alley","T. Engelder"],"year":"1996","journal":"International Journal of Rock Mechanics and Mining Sciences & Geomechanics Abstracts","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"eee3e6ea5bb67273912af58887227618fc30e3a9"}
+{"doi":"10.1016/0165-1765(94)00621-8","title":"An experimental test for gender differences in beneficent behavior","authors":["Gary E. Bolton","Elena Katok"],"year":"1995","journal":"Economics Letters","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"fe8694ee57d4c302e471dba32d3659fd0450bdc6"}
+{"doi":"10.1016/0165-5876(87)90052-8","title":"Non-organic stridor in children","authors":["Michael J. LaRouere","Charles F. Koopnann"],"year":"1987","journal":"International Journal of Pediatric Otorhinolaryngology","publisher":"Elsevier BV","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"3f9085123e1880ba6365abe6f0f40e059142f4f7"}
+{"doi":"10.1016/0168-583x(94)96196-4","title":"Aggregation of Frenkel defects under irradiation: a mesoscopic approach","authors":["W. Soppe","E. Kotomin"],"year":"1994","journal":"Nuclear Instruments and Methods in Physics Research Section B: Beam Interactions with Materials and Atoms","publisher":"Elsevier BV","subject":"Nuclear and High Energy Physics","type":"journal-article","sha":"abe1b49d7f715627d3f7c422255991a2273c64eb"}
+{"doi":"10.1016/0301-0104(87)80180-5","title":"IR dissociation of ammonia clusters","authors":["M. Snels","R. Fantoni","R. Sanders","W.Leo Meerts"],"year":"1987","journal":"Chemical Physics","publisher":"Elsevier BV","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"4d68c35017144b932f88635cb55faa84e898c468"}
+{"doi":"10.1016/0370-2693(94)91395-1","title":"Determination of the radiative decay width of the ηc meson","authors":["H. Albrecht","T. Hamacher","R.P. Hofmann","T. Kirchhoff","R. Mankel","A. Nau","S. Nowak","D. Reβing","H. Schröder","H.D. Schulz","M. Walter","R. Wurth","C. Hast","H. Kapitza","H. Kolanoski","A. Kosche","A. Lange","A. Lindner","M. Schieber","T. Siegmund","B. Spaan","H. Thurn","D. Töpfer","D. Wegener","P. Eckstein","M. Schmidtler","M. Schramm","K.R. Schubert","R. Schwierz","R. Waldi","K. Reim","H. Wegener","R. Eckmann","H. Kuipers","O. Mai","R. Mundt","T. Oest","R. Reiner","W. Schmidt-Parzefall","J. Stiewe","S. Werner","K. Ehret","W. Hofmann","A. Hüpper","K.T. Knöpfle","J. Spengler","P. Krieger","D.B. MacFarlane","J.D. Prentice","P.R.B. Saull","K. Tzamariudaki","R.G. Van de Water","T.-S. Yoon","C. Frankl","M. Schneider","S. Weseler","G. Kernel","P. Kržan","E. KrižniÄ","T. Podobnik","T. Živko","V. Balagura","I. Belyaev","S. Chechelnitsky","M. Danilov","A. Droutskoy","Yu. Gershtein","A. Golutvin","I. Korolko","G. Kostina","D. Litvintsev","V. Lubimov","P. Pakhlov","S. Semenov","A. Snizhko","I. Tichomirov","Yu. Zaitsev"],"year":"1994","journal":"Physics Letters B","publisher":"Elsevier BV","subject":"Nuclear and High Energy Physics","type":"journal-article","sha":"daa7b1259a2b7697272f4d9c2165b0f49d411ee8"}
+{"doi":"10.1016/0888-3270(88)90025-8","title":"On-line severity assessment of bearing damage via defect sensitive resonance identification and matched filtering","authors":["C. James Li","S.M. Wu"],"year":"1988","journal":"Mechanical Systems and Signal Processing","publisher":"Elsevier BV","subject":"Control and Systems Engineering","type":"journal-article","sha":"85e973e01c378aa42da7ac7d6ea9f7dbdea239c1"}
+{"doi":"10.1016/0893-6080(95)00092-5","title":"Space-variant active vision: Definition, overview and examples","authors":["Eric L. Schwartz","Douglas N. Greve","Giorgio Bonmassar"],"year":"1995","journal":"Neural Networks","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"2f95e02f6c3b252de0fd7f8e67e3a62f56c0a8d3"}
+{"doi":"10.1016/0927-6513(94)00056-2","title":"Characterization and removal of extra lattice species in faujasites","authors":["Michael Stockenhuber","J.A. Lercher"],"year":"1995","journal":"Microporous Materials","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"12d47e1c00070ddd4a00fe3e41b149d986368859"}
+{"doi":"10.1016/j.actamat.2005.01.016","title":"Drag effect of triple junctions on grain boundary and grain growth kinetics in aluminium","authors":["D. Mattissen","D.A. Molodov","L.S. Shvindlerman","G. Gottstein"],"year":"2005","journal":"Acta Materialia","publisher":"Elsevier BV","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"3b077490a775f72b58758a5bfe3e867936b19323"}
+{"doi":"10.1016/j.actatropica.2006.10.008","title":"Activity of azithromycin or erythromycin in combination with antimalarial drugs against multidrug-resistant Plasmodium falciparum in vitro","authors":["S. Nakornchai","P. Konthiang"],"year":"2006","journal":"Acta Tropica","publisher":"Elsevier BV","subject":"Parasitology","type":"journal-article","sha":"ce7e77d79cecc829afa035d742b43915914ce462"}
+{"doi":"10.1016/j.actbio.2010.09.038","title":"Mathematical modeling of degradation for bulk-erosive polymers: Applications in tissue engineering scaffolds and drug delivery systems","authors":["Yuhang Chen","Shiwei Zhou","Qing Li"],"year":"2011","journal":"Acta Biomaterialia","publisher":"Elsevier BV","subject":"Biotechnology","type":"journal-article","sha":"32b049910bb77ab8663af2698fc5449f7e3d49fd"}
+{"doi":"10.1016/j.ajodo.2005.04.001","title":"Orthodontics in 3 millennia. Chapter 3: The professionalization of orthodontics","authors":["Norman Wahl"],"year":"2005","journal":"American Journal of Orthodontics and Dentofacial Orthopedics","publisher":"Elsevier BV","subject":"Orthodontics","type":"journal-article","sha":"a05cc1d6546f5db7b93623f3325262f90bc7a534"}
+{"doi":"10.1016/j.amjsurg.2005.06.011","title":"An update of sentinel lymph node mapping in patients with ductal carcinoma in situ","authors":["Caren Wilkie","Laura White","Elisabeth Dupont","Alan Cantor","Charles E. Cox"],"year":"2005","journal":"The American Journal of Surgery","publisher":"Elsevier BV","subject":"Surgery","type":"journal-article","sha":"e673ccedcc6244856b494445b28326f88dcaa75c"}
+{"doi":"10.1016/j.aquaeng.2004.09.007","title":"Electric stunning of trout: power reduction using a two-stage stun","authors":["Jeff Lines","Steve Kestin"],"year":"2005","journal":"Aquacultural Engineering","publisher":"Elsevier BV","subject":"Aquatic Science","type":"journal-article","sha":"37970ff7ab8f7133b6200ae82398a27b436c0ec1"}
+{"doi":"10.1016/j.aquatox.2005.05.017","title":"Effects of water hardness on the physiological responses to chronic waterborne silver exposure in early life stages of rainbow trout (Oncorhynchus mykiss)","authors":["T.P. Morgan","C.M. Guadagnolo","M. Grosell","C.M. Wood"],"year":"2005","journal":"Aquatic Toxicology","publisher":"Elsevier BV","subject":"Aquatic Science","type":"journal-article","sha":"84d914fbb768e85f751e4143871999b9a3e5ff93"}
+{"doi":"10.1016/j.artmed.2008.11.006","title":"Modelling treatment effects in a clinical Bayesian network using Boolean threshold functions","authors":["Stefan Visscher","Peter J.F. Lucas","Carolina A.M. Schurink","Marc J.M. Bonten"],"year":"2009","journal":"Artificial Intelligence in Medicine","publisher":"Elsevier BV","subject":"Medicine (miscellaneous)","type":"journal-article","sha":"930cf107d9826ef14efe776f073198eb72a7a47d"}
+{"doi":"10.1016/j.autcon.2010.05.003","title":"Proactive problem-solver for construction","authors":["Wen-der Yu","Jyh-bin Yang","Judy C.R. Tseng","Shen-jung Liu","Ji-wei Wu"],"year":"2010","journal":"Automation in Construction","publisher":"Elsevier BV","subject":"Control and Systems Engineering","type":"journal-article","sha":"93bad2054e1d64a6eb4b5ae3bf23b30a53042e67"}
+{"doi":"10.1016/j.avb.2006.01.007","title":"Transforming a flawed policy: A call to revive psychology and science in domestic violence research and practice","authors":["Donald G. Dutton","Kenneth Corvo"],"year":"2006","journal":"Aggression and Violent Behavior","publisher":"Elsevier BV","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"84d2a156f953276433d689711914203b541b32d1"}
+{"doi":"10.1016/j.bandl.2010.11.006","title":"Brain activations associated with sign production using word and picture inputs in deaf signers","authors":["Zhiguo Hu","Wenjing Wang","Hongyan Liu","Danling Peng","Yanhui Yang","Kuncheng Li","John X. Zhang","Guosheng Ding"],"year":"2011","journal":"Brain and Language","publisher":"Elsevier BV","subject":"Speech and Hearing","type":"journal-article","sha":"18e94c8f5ff6672737ebb18b4a2a5dc11ee23938"}
+{"doi":"10.1016/j.bbalip.2011.11.006","title":"Dynamics of arachidonic acid mobilization by inflammatory cells","authors":["Alma M. Astudillo","David Balgoma","María A. Balboa","Jesús Balsinde"],"year":"2012","journal":"Biochimica et Biophysica Acta (BBA) - Molecular and Cell Biology of Lipids","publisher":"Elsevier BV","subject":"Cell Biology","type":"journal-article","sha":"359dec969545283d8a587bb7a13cb8bca6625c0e"}
+{"doi":"10.1016/j.bbrc.2007.05.110","title":"Effects of genetic polymorphisms of UCP2 and UCP3 on very low calorie diet-induced body fat reduction in Korean female subjects","authors":["Yoosik Yoon","Byung Lae Park","Min Ho Cha","Kil Soo Kim","Hyun Sub Cheong","Yoo Hyun Choi","Hyoung Doo Shin"],"year":"2007","journal":"Biochemical and Biophysical Research Communications","publisher":"Elsevier BV","subject":"Biophysics","type":"journal-article","sha":"3423c92d5d89568abb80775c1b4c99736019e0c5"}
+{"doi":"10.1016/j.biocon.2012.06.021","title":"Spatial distribution of wind turbines is crucial for the survival of red kite populations","authors":["Michael Schaub"],"year":"2012","journal":"Biological Conservation","publisher":"Elsevier BV","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"21bca26de097b8f054aae69fec0c4001c4496407"}
+{"doi":"10.1016/j.biopsycho.2005.11.014","title":"Toward understanding respiratory sinus arrhythmia: Relations to cardiac vagal tone, evolution and biobehavioral functions","authors":["Paul Grossman","Edwin W. Taylor"],"year":"2007","journal":"Biological Psychology","publisher":"Elsevier BV","subject":"Neuroscience(all)","type":"journal-article","sha":"6468a1c4c35e36e4cbd4855bfc37e053c52f2e83"}
+{"doi":"10.1016/j.bmc.2013.02.062","title":"Synthesis and biology of oligoethylene glycol linked naphthoxylosides","authors":["Karin Holmqvist","Andrea Persson","Richard Johnsson","Johanna Löfgren","Katrin Mani","Ulf Ellervik"],"year":"2013","journal":"Bioorganic & Medicinal Chemistry","publisher":"Elsevier BV","subject":"Medicine(all)","type":"journal-article","sha":"be7acd921dee2277c792f331f6a25aeff0e4b91b"}
+{"doi":"10.1016/j.bone.2009.08.049","title":"Orthodontic movement of impacted cuspid in fibrodysplastic bone: A case report","authors":["Giuseppe Colella","Angelo Itro","Letizia Perillo","Rosangela Cannavale"],"year":"2010","journal":"Bone","publisher":"Elsevier BV","subject":"Physiology","type":"journal-article","sha":"cee2ea38de1f469a6667e25947a2eaedcf5492e3"}
+{"doi":"10.1016/j.brainres.2011.12.005","title":"DTI reveals structural differences in white matter tracts between bilingual and monolingual children","authors":["Seyede Ghazal Mohades","Esli Struys","Peter Van Schuerbeek","Katrien Mondt","Piet Van De Craen","Robert Luypaert"],"year":"2012","journal":"Brain Research","publisher":"Elsevier BV","subject":"Developmental Biology","type":"journal-article","sha":"47d523cb18a2bf4f18ccd6cfda5ca71903cc7288"}
+{"doi":"10.1016/j.cam.2006.02.058","title":"State-dependent symplecticity and area preserving numerical methods","authors":["Felice Iavernaro","Donato Trigiante"],"year":"2007","journal":"Journal of Computational and Applied Mathematics","publisher":"Elsevier BV","subject":"Applied Mathematics","type":"journal-article","sha":"cddebc14a4480c21aca2ee2057b85b6cd97d1cc1"}
+{"doi":"10.1016/j.camwa.2007.01.013","title":"Lanchester’s equations in three dimensions","authors":["Christina Spradlin","Greg Spradlin"],"year":"2007","journal":"Computers & Mathematics with Applications","publisher":"Elsevier BV","subject":"Modelling and Simulation","type":"journal-article","sha":"2cb62e2222040f0e6ff928fac3f598047347277e"}
+{"doi":"10.1016/j.carbpol.2005.08.010","title":"Self-assembled nanoparticles based on linoleic-acid modified chitosan: Stability and adsorption of trypsin","authors":["C LIU","X CHEN","H PARK"],"year":"2005","journal":"Carbohydrate Polymers","publisher":"Elsevier BV","subject":"Organic Chemistry","type":"journal-article","sha":"58eec93b38646d81d9bc1ee93380fde3137d16ca"}
+{"doi":"10.1016/j.cattod.2006.01.010","title":"Sulphated AlMCM-41: Mesoporous solid Brønsted acid catalyst for dibenzoylation of biphenyl","authors":["Ng Eng Poh","Hadi Nur","Mohd Nazlan Mohd Muhid","Halimaton Hamdan"],"year":"2006","journal":"Catalysis Today","publisher":"Elsevier BV","subject":"Chemistry(all)","type":"journal-article","sha":"90ee57d508a4acca3208c223b9e93f762c8624f2"}
+{"doi":"10.1016/j.cell.2007.11.019","title":"Induction of Pluripotent Stem Cells from Adult Human Fibroblasts by Defined Factors","authors":["Kazutoshi Takahashi","Koji Tanabe","Mari Ohnuki","Megumi Narita","Tomoko Ichisaka","Kiichiro Tomoda","Shinya Yamanaka"],"year":"2007","journal":"Cell","publisher":"Elsevier BV","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"80a38c171da4ef9598ae1a0ffdf6f69511b8f715"}
+{"doi":"10.1016/j.ces.2013.02.020","title":"pH influence on oxygen mass transfer coefficient in a bubble column. Individual characterization of kL and a","authors":["A. Ferreira","P. Cardoso","J.A. Teixeira","F. Rocha"],"year":"2013","journal":"Chemical Engineering Science","publisher":"Elsevier BV","subject":"Industrial and Manufacturing Engineering","type":"journal-article","sha":"00b30086409bb9c5deb1d92d60b5be641e4688bf"}
+{"doi":"10.1016/j.chaos.2013.07.013","title":"Lévy flights in human behavior and cognition","authors":["Andrea Baronchelli","Filippo Radicchi"],"year":"2013","journal":"Chaos, Solitons & Fractals","publisher":"Elsevier BV","subject":"Mathematics(all)","type":"journal-article","sha":"2837fd59bb8e792b397ae5643d0f3b51f8e66c3a"}
+{"doi":"10.1016/j.clinph.2007.06.011","title":"Neural correlates of stopping and self-reported impulsivity","authors":["Marieke M. Lansbergen","Koen B.E. Böcker","Evelijne M. Bekker","J. Leon Kenemans"],"year":"2007","journal":"Clinical Neurophysiology","publisher":"Elsevier BV","subject":"Physiology (medical)","type":"journal-article","sha":"29d21d871eab9b16fdbe40ebe6aae59d90d45f6d"}
+{"doi":"10.1016/j.cma.2005.08.020","title":"Critical state plasticity. Part VI: Meso-scale finite element simulation of strain localization in discrete granular materials","authors":["Ronaldo I. Borja","José E. Andrade"],"year":"2006","journal":"Computer Methods in Applied Mechanics and Engineering","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"df0fb045f0f1cdb503ca9b945a3d1b0a2fb6f36e"}
+{"doi":"10.1016/j.cognition.2006.10.011","title":"Attentional modulation of masked repetition and categorical priming in young and older adults","authors":["Ludovic Fabre","Patrick Lemaire","Jonathan Grainger"],"year":"2007","journal":"Cognition","publisher":"Elsevier BV","subject":"Linguistics and Language","type":"journal-article","sha":"bd00dae009aa118aff8a8f2aacd85cd114431630"}
+{"doi":"10.1016/j.comcom.2009.08.010","title":"On the applicability of available bandwidth estimation techniques and tools","authors":["Cesar D. Guerrero","Miguel A. Labrador"],"year":"2010","journal":"Computer Communications","publisher":"Elsevier BV","subject":"Computer Networks and Communications","type":"journal-article","sha":"1861f48021815c373bd81c64fbba026a2e98c5c5"}
+{"doi":"10.1016/j.comcom.2012.06.011","title":"Greening wireless communications: Status and future directions","authors":["Pablo Serrano","Antonio de la Oliva","Paul Patras","Vincenzo Mancuso","Albert Banchs"],"year":"2012","journal":"Computer Communications","publisher":"Elsevier BV","subject":"Computer Networks and Communications","type":"journal-article","sha":"e3c8a84b1e93bda5585645c2869e388ef27fdc9d"}
+{"doi":"10.1016/j.compfluid.2015.12.007","title":"Assessment of turbulence model performance: Severe acceleration with large integral length scales","authors":["Xiaoyu Yang","Paul G. Tucker"],"year":"2016","journal":"Computers & Fluids","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"5180a903533f026d72147743ac3915af6028dd46"}
+{"doi":"10.1016/j.compstruct.2012.10.039","title":"Numerical simulation of Lamb wave propagation in metallic foam sandwich structures: a parametric study","authors":["Seyed Mohammad Hossein Hosseini","Abdolreza Kharaghani","Christoph Kirsch","Ulrich Gabbert"],"year":"2013","journal":"Composite Structures","publisher":"Elsevier BV","subject":"Civil and Structural Engineering","type":"journal-article","sha":"0fb9409afe8154563f8530d8e4f330e49a9fbe53"}
+{"doi":"10.1016/j.conbuildmat.2005.07.008","title":"Durability of masonry systems: A laboratory study","authors":["G. Cultrone","E. Sebastián","M. Ortega Huertas"],"year":"2007","journal":"Construction and Building Materials","publisher":"Elsevier BV","subject":"Materials Science(all)","type":"journal-article","sha":"3321de8121a8f4904459491bff77b8c81c672da0"}
+{"doi":"10.1016/j.concog.2012.03.012","title":"Alerting and orienting of attention without visual awareness","authors":["Shena Lu","Yongchun Cai","Mowei Shen","Ying Zhou","Shihui Han"],"year":"2012","journal":"Consciousness and Cognition","publisher":"Elsevier BV","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"ef459dbe36d5e1ea4d208003b4723407f1e9e92c"}
+{"doi":"10.1016/j.csi.2003.12.003","title":"Design and implementation of a mobile database for Java phones","authors":["Eric Jui-Lin Lu","Yung-Yuan Cheng"],"year":"2004","journal":"Computer Standards & Interfaces","publisher":"Elsevier BV","subject":"Hardware and Architecture","type":"journal-article","sha":"a47b6b2f94e27321a60d5216b50002a6a86a0279"}
+{"doi":"10.1016/j.cub.2006.12.041","title":"Human Visual System Integrates Color Signals along a Motion Trajectory","authors":["Shin'ya Nishida","Junji Watanabe","Ichiro Kuriki","Toyotaro Tokimoto"],"year":"2007","journal":"Current Biology","publisher":"Elsevier BV","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"2ef0bfa7c7b0bff2b8179fba07bc11dbb0d9e729"}
+{"doi":"10.1016/j.cub.2014.05.027","title":"Sleep: A Biological Stimulus from Our Nearest Celestial Neighbor?","authors":["Vladyslav V. Vyazovskiy","Russell G. Foster"],"year":"2014","journal":"Current Biology","publisher":"Elsevier BV","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"e3c8f9e3e55ea877222d479e90b5d746e6b92ba9"}
+{"doi":"10.1016/j.cub.2014.08.046","title":"Motor, not visual, encoding of potential reach targets","authors":["Brandie M. Stewart","Jason P. Gallivan","Lee A. Baugh","J. Randall Flanagan"],"year":"2014","journal":"Current Biology","publisher":"Elsevier BV","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"6ca1b6676fb6de84436b00e9ad06d493d1d7fbb6"}
+{"doi":"10.1016/j.diagmicrobio.2010.04.002","title":"In vitro activity of tigecycline and comparators on Acinetobacter spp. isolates collected from patients with bacteremia and MIC change during the Tigecycline Evaluation and Surveillance Trial, 2004 to 2008","authors":["Yun F. (Wayne) Wang","Michael J. Dowzicky"],"year":"2010","journal":"Diagnostic Microbiology and Infectious Disease","publisher":"Elsevier BV","subject":"Microbiology (medical)","type":"journal-article","sha":"5b94e4d237d2f4f5fd608cf0fa1534baaf7b7003"}
+{"doi":"10.1016/j.echo.2006.08.039","title":"American Society of Echocardiography/Society of Cardiovascular Anesthesiologists Recommendations and Guidelines for Continuous Quality Improvement in Perioperative Echocardiography","authors":["Joseph P. Mathew","Kathryn Glas","Christopher A. Troianos","Pamela Sears-Rogan","Robert Savage","Jack Shanewise","Joseph Kisslo","Solomon Aronson","Stanton Shernan"],"year":"2006","journal":"Journal of the American Society of Echocardiography","publisher":"Elsevier BV","subject":"Radiology Nuclear Medicine and imaging","type":"journal-article","sha":"c94d844e8a2d151a9c52307807cf3781f9468e55"}
+{"doi":"10.1016/j.ecolmodel.2014.12.013","title":"Information provision, policy support, and farmers’ adaptive responses against drought: An empirical study in the North China Plain","authors":["Jinxia Wang","Yu Yang","Jikun Huang","Kevin Chen"],"year":"2015","journal":"Ecological Modelling","publisher":"Elsevier BV","subject":"Ecological Modelling","type":"journal-article","sha":"95c625fa375bd44592a89a505a1305f4303dae3d"}
+{"doi":"10.1016/j.ejor.2001.09.001","title":"Componentwise bounds for nearly completely decomposable Markov chains using stochastic comparison and reordering","authors":["Nihal Pekergin","TuÄŸrul Dayar","Denizhan N. Alparslan"],"year":"2005","journal":"European Journal of Operational Research","publisher":"Elsevier BV","subject":"Management Science and Operations Research","type":"journal-article","sha":"049d840c4e2563e97bc9a21eafdaee64f49357a5"}
+{"doi":"10.1016/j.electacta.2011.05.090","title":"Highly flexible supercapacitors with manganese oxide nanosheet/carbon cloth electrode","authors":["Ying-Chu Chen","Yu-Kuei Hsu","Yan-Gu Lin","Yu-Kai Lin","Ying-Ying Horng","Li-Chyong Chen","Kuei-Hsien Chen"],"year":"2011","journal":"Electrochimica Acta","publisher":"Elsevier BV","subject":"Electrochemistry","type":"journal-article","sha":"426bf3b3614659fc002b0a93a249840be130f73f"}
+{"doi":"10.1016/j.engstruct.2014.01.048","title":"Influence of boundary conditions and size effect on the drift capacity of URM walls","authors":["Sarah Petry","Katrin Beyer"],"year":"2014","journal":"Engineering Structures","publisher":"Elsevier BV","subject":"Civil and Structural Engineering","type":"journal-article","sha":"fbb95388581da620d91cb4ffd27c0ac4c7ca11e6"}
+{"doi":"10.1016/j.enpol.2003.12.014","title":"Multi-criteria analysis of alternative-fuel buses for public transportation","authors":["Gwo-Hshiung Tzeng","Cheng-Wei Lin","Serafim Opricovic"],"year":"2005","journal":"Energy Policy","publisher":"Elsevier BV","subject":"Energy(all)","type":"journal-article","sha":"cd43b2c2a2a33e508f01df4fe9480c1a244cebde"}
+{"doi":"10.1016/j.entcs.2007.01.023","title":"Synthesis of Moduli of Uniform Continuity by the Monotone Dialectica Interpretation in the Proof-system MinLog","authors":["Mircea-Dan Hernest"],"year":"2007","journal":"Electronic Notes in Theoretical Computer Science","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"842d8ac31fa1bca3db9e6f694d8343f6f9a1e7c5"}
+{"doi":"10.1016/j.epsl.2005.04.049","title":"Thermal plume models and melt generation in East Africa: A dynamic modeling approach","authors":["Shu-Chuan Lin","Ban-Yuan Kuo","Ling-Yun Chiao","Peter E. van Keken"],"year":"2005","journal":"Earth and Planetary Science Letters","publisher":"Elsevier BV","subject":"Earth and Planetary Sciences (miscellaneous)","type":"journal-article","sha":"16dd01f9b58f8ddc90ab7dff82e4e3d86532406f"}
+{"doi":"10.1016/j.eswa.2007.05.008","title":"Adaptive wavelet network for multiple cardiac arrhythmias recognition","authors":["C LIN","Y DU","T CHEN"],"year":"2008","journal":"Expert Systems with Applications","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"08c914766160ce311856a73c1648aea3bb9e2ff0"}
+{"doi":"10.1016/j.eswa.2013.08.027","title":"Location-based grid-index for spatial query processing","authors":["Kwangjin Park"],"year":"2014","journal":"Expert Systems with Applications","publisher":"Elsevier BV","subject":"Engineering(all)","type":"journal-article","sha":"cda765ca96c26e755457955f64795e3f4c3faf07"}
+{"doi":"10.1016/j.fcl.2006.03.005","title":"Ankle Anatomy for the Arthroscopist. Part I: The Portals","authors":["Pau Golanó","Jordi Vega","Luis Pérez-Carro","Víctor Götzens"],"year":"2006","journal":"Foot and Ankle Clinics","publisher":"Elsevier BV","subject":"Surgery","type":"journal-article","sha":"3e8803ffdbed2fe90ede5d24836cfcbb88a2cccc"}
+{"doi":"10.1016/j.fertnstert.2008.12.098","title":"Human embryo twinning with applications in reproductive medicine","authors":["Karl Illmensee","Mike Levanduski","Andrea Vidali","Nabil Husami","Vasilios T. Goudas"],"year":"2010","journal":"Fertility and Sterility","publisher":"Elsevier BV","subject":"Obstetrics and Gynaecology","type":"journal-article","sha":"73c7596a2b0bd851745ac54d4ed28cceefe31472"}
+{"doi":"10.1016/j.fsigen.2006.10.003","title":"DNA Commission of the International Society for Forensic Genetics (ISFG): Recommendations regarding the role of forensic genetics for disaster victim identification (DVI)","authors":["M. Prinz","A. Carracedo","W.R. Mayr","N. Morling","T.J. Parsons","A. Sajantila","R. Scheithauer","H. Schmitter","P.M. Schneider"],"year":"2007","journal":"Forensic Science International: Genetics","publisher":"Elsevier BV","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"deb292212a04799c016cf189cf6a63e9327488d8"}
+{"doi":"10.1016/j.gca.2007.05.014","title":"Optically continuous silcrete quartz cements of the St. Peter Sandstone: High precision oxygen isotope analysis by ion microprobe","authors":["Jacque L. Kelly","Bin Fu","Noriko T. Kita","John W. Valley"],"year":"2007","journal":"Geochimica et Cosmochimica Acta","publisher":"Elsevier BV","subject":"Geochemistry and Petrology","type":"journal-article","sha":"9daa0206e43359a1cc4957e603fe4c1bd23f0acd"}
+{"doi":"10.1016/j.gca.2008.02.021","title":"A laser-ablation ICP-MS study of Apollo 15 low-titanium olivine-normative and quartz-normative mare basalts","authors":["Darren W. Schnare","James M.D. Day","Marc D. Norman","Yang Liu","Lawrence A. Taylor"],"year":"2008","journal":"Geochimica et Cosmochimica Acta","publisher":"Elsevier BV","subject":"Geochemistry and Petrology","type":"journal-article","sha":"83712a06f6e65abf584dad268e3425aec6ff313b"}
+{"doi":"10.1016/j.geb.2004.03.003","title":"Stationary equilibria in discounted stochastic games with weakly interacting players","authors":["Ulrich Horst"],"year":"2005","journal":"Games and Economic Behavior","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"76baee4b62be0737b9ca10d73dc8b80dbc115ad7"}
+{"doi":"10.1016/j.gexplo.2012.11.003","title":"Degradation of 2,4-dichlorophenol and coupling into humic matter by oxidative biomimetic catalysis with iron-porphyrin","authors":["Barbara Fontaine","Assunta Nuzzo","Riccardo Spaccini","Alessandro Piccolo"],"year":"2013","journal":"Journal of Geochemical Exploration","publisher":"Elsevier BV","subject":"Economic Geology","type":"journal-article","sha":"a8a47f493c5b7bacda6f4eb814e7158688e5bf41"}
+{"doi":"10.1016/j.gloplacha.2006.11.036","title":"Evaluating digital elevation models for glaciologic applications: An example from Nevado Coropuna, Peruvian Andes","authors":["Adina E. Racoviteanu","William F. Manley","Yves Arnaud","Mark W. Williams"],"year":"2007","journal":"Global and Planetary Change","publisher":"Elsevier BV","subject":"Global and Planetary Change","type":"journal-article","sha":"88515146b39c3f7f7ebb15cf14207c6b2909e0d0"}
+{"doi":"10.1016/j.ijnonlinmec.2005.04.005","title":"A new model for the study of rain-wind-induced vibrations of a simple oscillator","authors":["A.H.P. van der Burgh"," Hartono","A.K. Abramian"],"year":"2006","journal":"International Journal of Non-Linear Mechanics","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"515513a8d7e9b74306a66ce79b1b2f8f4f777caa"}
+{"doi":"10.1016/j.ijnurstu.2012.06.013","title":"A multi-country perspective on nurses’ tasks below their skill level: Reports from domestically trained nurses and foreign trained nurses from developing countries","authors":["Luk Bruyneel","Baoyue Li","Linda Aiken","Emmanuel Lesaffre","Koen Van den Heede","Walter Sermeus"],"year":"2013","journal":"International Journal of Nursing Studies","publisher":"Elsevier BV","subject":"Nursing(all)","type":"journal-article","sha":"9fe650ba76b24ef259a560ee4c9ced0cceaa5cc6"}
+{"doi":"10.1016/j.ijoa.2007.10.004","title":"Maternal expectations and birth-related experiences: a survey of pregnant women of mixed parity from Calcutta, India","authors":["I. Hug","C. Chattopadhyay","G. Roy Mitra","R. Mukherjee Kar Mahapatra","M.C. Schneider"],"year":"2008","journal":"International Journal of Obstetric Anesthesia","publisher":"Elsevier BV","subject":"Obstetrics and Gynaecology","type":"journal-article","sha":"6337954a902904fa37364d9061a04e5a95aed0ca"}
+{"doi":"10.1016/j.ijrefrig.2004.09.004","title":"Experimental study on a continuous adsorption water chiller with novel design","authors":["Y.L. Liu","R.Z. Wang","Z.Z. Xia"],"year":"2005","journal":"International Journal of Refrigeration","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"25ff350f5185c1a73ce356858d4ad6fd88c29fdf"}
+{"doi":"10.1016/j.ijsolstr.2004.03.003","title":"Basis-free representations for the stress rate of isotopic materials","authors":["Guansuo Dui","Yi-chao Chen"],"year":"2004","journal":"International Journal of Solids and Structures","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"4cd92ed9c57264a2571ed4f543a9ae1192f498dc"}
+{"doi":"10.1016/j.infsof.2005.05.007","title":"Comparison of software architecture reverse engineering methods","authors":["C. Stringfellow","C.D. Amory","D. Potnuri","A. Andrews","M. Georg"],"year":"2006","journal":"Information and Software Technology","publisher":"Elsevier BV","subject":"Software","type":"journal-article","sha":"f7be66f7da628006696bdc0702d76dd0c8b1ae06"}
+{"doi":"10.1016/j.ipl.2003.10.005","title":"Finding optimal paths in MREP routing","authors":["Rudolf Fleischer","Mordecai Golin","Chin-Tau Lea","Steven Wong"],"year":"2004","journal":"Information Processing Letters","publisher":"Elsevier BV","subject":"Signal Processing","type":"journal-article","sha":"d7e97c0b43a7ec0eba143be18cfbb8a525c7afc1"}
+{"doi":"10.1016/j.jaac.2015.02.006","title":"Pathways to Suicide-Related Behavior in Offspring of Mothers With Depression: The Role of Offspring Psychopathology","authors":["Gemma Hammerton","Stanley Zammit","Liam Mahedy","Rebecca M. Pearson","Ruth Sellers","Anita Thapar","Stephan Collishaw"],"year":"2015","journal":"Journal of the American Academy of Child & Adolescent Psychiatry","publisher":"Elsevier BV","subject":"Developmental and Educational Psychology","type":"journal-article","sha":"0bd5b0eb5e91e9721cbca3dd64673e00acc6387a"}
+{"doi":"10.1016/j.jallcom.2012.07.089","title":"Nanomechanical properties of GaSe thin films deposited on Si(111) substrates by pulsed laser deposition","authors":["Sheng-Rui Jian","Jenh-Yih Juang","Chih-Wei Luo","Shin-An Ku","Kaung-Hsiung Wu"],"year":"2012","journal":"Journal of Alloys and Compounds","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"95ea4042276e558b89c04abd80d292b35e2688c3"}
+{"doi":"10.1016/j.jbiomech.2004.08.020","title":"Ultrasonic measurement of depth-dependent transient behaviors of articular cartilage under compression","authors":["Y.P. Zheng","H.J. Niu","F.T. Arthur Mak","Y.P. Huang"],"year":"2005","journal":"Journal of Biomechanics","publisher":"Elsevier BV","subject":"Biophysics","type":"journal-article","sha":"c2cda2b48d055ddf227e31d3b4aae758e5da9791"}
+{"doi":"10.1016/j.jbusres.2011.12.019","title":"Little Emperors in the UK: Acculturation and food over time","authors":["Benedetta Cappellini","Dorothy Ai-wan Yen"],"year":"2013","journal":"Journal of Business Research","publisher":"Elsevier BV","subject":"Marketing","type":"journal-article","sha":"f551fd60f6e345da3b03dbc089ed8ae2358f0c81"}
+{"doi":"10.1016/j.jcma.2011.01.038","title":"The benefits of body weight loss on health-related quality of life","authors":["Hsiang-Ju Pan","Beatriz M. Cole","Allan Geliebter"],"year":"2011","journal":"Journal of the Chinese Medical Association","publisher":"Elsevier BV","subject":"Medicine(all)","type":"journal-article","sha":"faf33567051201e084ed73147f8b97a86afc13fe"}
+{"doi":"10.1016/j.jcss.2014.01.001","title":"Rigorously modeling self-stabilizing fault-tolerant circuits: An ultra-robust clocking scheme for systems-on-chip","authors":["Danny Dolev","Matthias Függer","Markus Posch","Ulrich Schmid","Andreas Steininger","Christoph Lenzen"],"year":"2014","journal":"Journal of Computer and System Sciences","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"124eeff9e2c3d7ea060c45a86248b1ae265ec117"}
+{"doi":"10.1016/j.jdeveco.2011.06.001","title":"Do interest rates matter? Credit demand in the Dhaka slums","authors":["Rajeev Dehejia","Heather Montgomery","Jonathan Morduch"],"year":"2012","journal":"Journal of Development Economics","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"367c542b8d4073c808b87d448e14b113d4766bab"}
+{"doi":"10.1016/j.jmps.2003.09.019","title":"Mechanics of the human red blood cell deformed by optical tweezers","authors":["M. Dao","C.T. Lim","S. Suresh"],"year":"2003","journal":"Journal of the Mechanics and Physics of Solids","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"e48169ad11d9224530158b5acf81297f50f43ed5"}
+{"doi":"10.1016/j.jpdc.2006.03.004","title":"Fractal: A mobile code-based framework for dynamic application protocol adaptation","authors":["H. Lufei","W. Shi"],"year":"2006","journal":"Journal of Parallel and Distributed Computing","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"a5bee2b4afb483518f392f2671bb95142ebeb8ff"}
+{"doi":"10.1016/j.jpedsurg.2003.10.016","title":"Ten years of maturation of endoscopic surgery in children. Is the wine good?","authors":["N.M.A Bax"],"year":"2004","journal":"Journal of Pediatric Surgery","publisher":"Elsevier BV","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"ff7eeff4190ccaf23bb3bf83a1d4a3b224d640fd"}
+{"doi":"10.1016/j.jpsychores.2004.11.005","title":"The effects of repeated thermal therapy for two patients with chronic fatigue syndrome","authors":["Akinori Masuda","Takashi Kihara","Tsuyoshi Fukudome","Takuro Shinsato","Shinichi Minagoe","Chuwa Tei"],"year":"2005","journal":"Journal of Psychosomatic Research","publisher":"Elsevier BV","subject":"Clinical Psychology","type":"journal-article","sha":"c6d421bffce8136cc9fa18bed1519115a4284a25"}
+{"doi":"10.1016/j.jpubeco.2007.07.001","title":"Distributional impacts of the Self-Sufficiency Project","authors":["Marianne P. Bitler","Jonah B. Gelbach","Hilary W. Hoynes"],"year":"2008","journal":"Journal of Public Economics","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"50d82d1a300d1a8a86fdbf5aa5069b1be750e1fd"}
+{"doi":"10.1016/j.jpubeco.2008.09.003","title":"The trajectory of wealth in retirement","authors":["David A. Love","Michael G. Palumbo","Paul A. Smith"],"year":"2009","journal":"Journal of Public Economics","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"446ff148066cbca1cfa411a45bf25f5a96f47253"}
+{"doi":"10.1016/j.jpurol.2007.11.007","title":"Testicular torsion: Orchiectomy or orchiopexy?","authors":["Seppo Taskinen","Mervi Taskinen","Risto Rintala"],"year":"2008","journal":"Journal of Pediatric Urology","publisher":"Elsevier BV","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"993a6cccd834585a2804b29a826fb984cd83d603"}
+{"doi":"10.1016/j.jspi.2010.05.010","title":"Nonparametric regression for dependent data in the errors-in-variables problem","authors":["Toshio Honda"],"year":"2010","journal":"Journal of Statistical Planning and Inference","publisher":"Elsevier BV","subject":"Statistics, Probability and Uncertainty","type":"journal-article","sha":"1843b161d067ace3318cf167f2b550ca1e88aa6e"}
+{"doi":"10.1016/j.jval.2011.05.036","title":"Implementing Pharmacoeconomic Guidelines in Latin America: Lessons Learned","authors":["Federico Augustovski","Guillermo Melendez","Alexandre Lemgruber","Michael Drummond"],"year":"2011","journal":"Value in Health","publisher":"Elsevier BV","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"7f1129ded6d82d9098833efeda160cbde393ed4c"}
+{"doi":"10.1016/j.labeco.2010.09.006","title":"Are specific skills an obstacle to labor market adjustment?","authors":["Ana Lamo","Julián Messina","Etienne Wasmer"],"year":"2011","journal":"Labour Economics","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"2b8821a952291b59ab956b134c71fdb9f029f776"}
+{"doi":"10.1016/j.marchem.2006.04.006","title":"Silicon isotopes in spring Southern Ocean diatoms: Large zonal changes despite homogeneity among size fractions","authors":["Damien Cardinal","Nicolas Savoye","Thomas W. Trull","Frank Dehairs","Elzbieta E. Kopczynska","François Fripiat","Jean-Louis Tison","Luc André"],"year":"2007","journal":"Marine Chemistry","publisher":"Elsevier BV","subject":"Chemistry(all)","type":"journal-article","sha":"8c165f73a17388cac53ec899002a5a72bbbbbab0"}
+{"doi":"10.1016/j.marpolbul.2006.04.010","title":"The ecological quality status of the Bay of Seine and the Seine estuary: Use of biotic indices","authors":["Jean-Claude Dauvin","Thierry Ruellet","Nicolas Desroy","Anne-Laure Janson"],"year":"2007","journal":"Marine Pollution Bulletin","publisher":"Elsevier BV","subject":"Aquatic Science","type":"journal-article","sha":"be1d1c75cddbbf75eb92638b24a2773cb40a282b"}
+{"doi":"10.1016/j.math.2008.09.004","title":"Sex differences in the pattern of innominate motion during passive hip abduction and external rotation","authors":["Melanie D. Bussey","Stephan Milosavljevic","Melanie L. Bell"],"year":"2009","journal":"Manual Therapy","publisher":"Elsevier BV","subject":"Physical Therapy, Sports Therapy and Rehabilitation","type":"journal-article","sha":"870c08ced2d2d9fe828ceecac51ce62b5b32c89d"}
+{"doi":"10.1016/j.matlet.2003.11.019","title":"Phase formation and characterization of BaBi2Ta2O9 obtained by mixed oxide procedure","authors":["G.C.C. da Costa","A.Z. Simões","A. Ries","C.R. Foschini","M.A. Zaghete","J.A. Varela"],"year":"2004","journal":"Materials Letters","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"f08e450c8d3d432348ec3de3dde626611c7576f7"}
+{"doi":"10.1016/j.matlet.2006.07.104","title":"Characteristics of III-nitride photodiodes with self-assembled quantum dots","authors":["Liang-Wen Ji","Te-Hua Fang","Sheng-Joue Young","Chi-Chung Liu","Yin-Lai Chai"],"year":"2007","journal":"Materials Letters","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"89fae62792497b43e10a7e6cf597b5a090c6d051"}
+{"doi":"10.1016/j.mce.2004.11.002","title":"KATP-channels in beta-cells in tissue slices are directly modulated by millimolar ATP","authors":["S. Speier","S.-B. Yang","K. Sroka","T. Rose","M. Rupnik"],"year":"2005","journal":"Molecular and Cellular Endocrinology","publisher":"Elsevier BV","subject":"Biochemistry","type":"journal-article","sha":"c0b7ce2ec5f57040ea8dddc15d669411f3ee339b"}
+{"doi":"10.1016/j.mimet.2012.03.009","title":"The first performance report for the Bio-Rad Dx CT/NG/MG assay for simultaneous detection of Chlamydia trachomatis, Neisseria gonorrhoeae and Mycoplasma genitalium in urogenital samples","authors":["Chloé Le Roy","Isabelle Le Hen","Maïthé Clerc","Véronique Arfel","Françoise Normandin","Cécile Bébéar","Bertille de Barbeyrac"],"year":"2012","journal":"Journal of Microbiological Methods","publisher":"Elsevier BV","subject":"Microbiology (medical)","type":"journal-article","sha":"304abcd565e3b5bcc92c012b44b56aa3384c2c44"}
+{"doi":"10.1016/j.neucom.2005.05.013","title":"Speed estimation with propagation maps","authors":["C. Rasche"],"year":"2006","journal":"Neurocomputing","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"9b61c06df88a993f50abcc6011fff54c1e8ab812"}
+{"doi":"10.1016/j.neucom.2010.07.030","title":"The complex local mean decomposition","authors":["Cheolsoo Park","David Looney","Marc M. Van Hulle","Danilo P. Mandic"],"year":"2011","journal":"Neurocomputing","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"701320db14d563f49d0764879594488a35303acc"}
+{"doi":"10.1016/j.neuroimage.2009.09.019","title":"Having a word with yourself: Neural correlates of self-criticism and self-reassurance","authors":["Olivia Longe","Frances A. Maratos","Paul Gilbert","Gaynor Evans","Faye Volker","Helen Rockliff","Gina Rippon"],"year":"2010","journal":"NeuroImage","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"0e84ae34f2942d66accb38777fffbdafdfcd5511"}
+{"doi":"10.1016/j.neuroscience.2008.03.085","title":"Neurotoxic lesions at the ventral mesopontine junction change sleep time and muscle activity during sleep: An animal model of motor disorders in sleep","authors":["Y.-Y. Lai","K.-C. Hsieh","D. Nguyen","J. Peever","J.M. Siegel"],"year":"2008","journal":"Neuroscience","publisher":"Elsevier BV","subject":"Neuroscience(all)","type":"journal-article","sha":"bbdbde1775b8133db0a2ad1afd1f4b89ff7a80fe"}
+{"doi":"10.1016/j.nima.2009.03.080","title":"Advanced numerical simulations of temperature anisotropy instabilities and collective interaction processes in high-intensity bunched ion beams","authors":["Hong Qin","Ronald C. Davidson","Edward A. Startsev"],"year":"2009","journal":"Nuclear Instruments and Methods in Physics Research Section A: Accelerators, Spectrometers, Detectors and Associated Equipment","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"4c5c4cca2cf0f8891b891a4ec4c4d03d89af2623"}
+{"doi":"10.1016/j.nimb.2006.11.068","title":"Comparative study of silicon and germanium sputtering by 1–20keV Ar ions","authors":["V.I. Shulga"],"year":"2007","journal":"Nuclear Instruments and Methods in Physics Research Section B: Beam Interactions with Materials and Atoms","publisher":"Elsevier BV","subject":"Nuclear and High Energy Physics","type":"journal-article","sha":"0e580a14e701dbb8fd49afcd1683dfa91f66529a"}
+{"doi":"10.1016/j.omega.2006.07.004","title":"Knowledge Management and Organizational Learning","authors":["W KING","T CHUNG","M HANEY"],"year":"2008","journal":"Omega","publisher":"Elsevier BV","subject":"Management Science and Operations Research","type":"journal-article","sha":"5f20e07e05dae05bceaeac7b6d21d1d6bb725a8b"}
+{"doi":"10.1016/j.orggeochem.2014.07.006","title":"Airborne hydrocarbon contamination from laboratory atmospheres","authors":["Christian J. Illing","Christian Hallmann","Kristen E. Miller","Roger E. Summons","Harald Strauss"],"year":"2014","journal":"Organic Geochemistry","publisher":"Elsevier BV","subject":"Geochemistry and Petrology","type":"journal-article","sha":"5b27b0b77b47370cbfd776c063ba24eb027ccbb2"}
+{"doi":"10.1016/j.petrol.2010.03.023","title":"Experimental testing and modelling of an industrial insulated pipeline for deep sea application","authors":["Nadège Bouchonneau","Valérie Sauvant-Moynot","Dominique Choqueuse","François Grosjean","Emmanuel Poncet","Dominique Perreux"],"year":"2010","journal":"Journal of Petroleum Science and Engineering","publisher":"Elsevier BV","subject":"Fuel Technology","type":"journal-article","sha":"9eb48fd750b113b0442e771a6f34ad79d5558353"}
+{"doi":"10.1016/j.pharep.2014.02.004","title":"Beneficial role of tamoxifen in experimentally induced cardiac hypertrophy","authors":["Bhoomika M. Patel","Vishal J. Desai"],"year":"2014","journal":"Pharmacological Reports","publisher":"Elsevier BV","subject":"Pharmacology","type":"journal-article","sha":"82ca239889e86a2e2af89cf1e3d6d1549717a27f"}
+{"doi":"10.1016/j.physa.2012.10.025","title":"An exploratory statistical approach to depression pattern identification","authors":["Qing Yi Feng","Frances Griffiths","Nick Parsons","Jane Gunn"],"year":"2013","journal":"Physica A: Statistical Mechanics and its Applications","publisher":"Elsevier BV","subject":"Statistics and Probability","type":"journal-article","sha":"7523d057e767d56df3a620651746bdacd3620758"}
+{"doi":"10.1016/j.physb.2006.12.049","title":"Tailoring the enhanced frequency shift in two-dimensional photonic clusters","authors":["A.A. Asatryan","L.C. Botten","N.A. Nicorovici","R.C. McPhedran","C.M. de Sterke"],"year":"2007","journal":"Physica B: Condensed Matter","publisher":"Elsevier BV","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"c0f596c40a060d81957674f028cd931a262c7df3"}
+{"doi":"10.1016/j.pmcj.2005.01.004","title":"Secure context-sensitive authorization","authors":["Kazuhiro Minami","David Kotz"],"year":"2005","journal":"Pervasive and Mobile Computing","publisher":"Elsevier BV","subject":"Computer Science (miscellaneous)","type":"journal-article","sha":"7e9a4b04b7dc27d2deb819ea0982d6fb101ec83e"}
+{"doi":"10.1016/j.pmcj.2012.05.004","title":"Designing on-site: Facilitating participatory contextual architecture with mobile phones","authors":["Mikael B. Skov","Jesper Kjeldskov","Jeni Paay","Niels Husted","Jacob Nørskov","Kenneth Pedersen"],"year":"2013","journal":"Pervasive and Mobile Computing","publisher":"Elsevier BV","subject":"Computer Science (miscellaneous)","type":"journal-article","sha":"cf8640291b4fa77011b9fda161f7c1e8ed07b3af"}
+{"doi":"10.1016/j.pnpbp.2005.08.006","title":"Limbic system mechanisms of stress regulation: Hypothalamo-pituitary-adrenocortical axis","authors":["James P. Herman","Michelle M. Ostrander","Nancy K. Mueller","Helmer Figueiredo"],"year":"2005","journal":"Progress in Neuro-Psychopharmacology and Biological Psychiatry","publisher":"Elsevier BV","subject":"Biological Psychiatry","type":"journal-article","sha":"b2b806e5639172511d407d179eb2323aa7ad2de0"}
+{"doi":"10.1016/j.polsoc.2009.01.007","title":"Stability, participation and transparency in renewable energy policy: Lessons from Denmark and the United States","authors":["Miguel Mendonça","Stephen Lacey","Frede Hvelplund"],"year":"2009","journal":"Policy and Society","publisher":"Informa UK Limited","subject":"Political Science and International Relations","type":"journal-article","sha":"91ca1537ae9c9c573a81cd8ac5ec5ffbad28881d"}
+{"doi":"10.1016/j.poly.2002.08.001","title":"[Cu(4-oxopyrimidinate)2·nH2O]∞: a robust sodalite type metal-organic framework exhibiting a rich host–guest chemistry","authors":["Elisa Barea","Jorge A.R. Navarro","Juan M. Salas","Norberto Masciocchi","Simona Galli","Angelo Sironi"],"year":"2003","journal":"Polyhedron","publisher":"Elsevier BV","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"9a1109c28d3738f10da562be2e0ac83d1dc11979"}
+{"doi":"10.1016/j.polymer.2004.05.043","title":"Thermal properties, miscibility and specific interactions in comparison of linear and star poly(methyl methacrylate) blend with phenolic","authors":["Chih-Feng Huang","Shiao-Wei Kuo","Han-Ching Lin","Jem-Kun Chen","Yu-Kai Chen","Hongyao Xu","Feng-Chih Chang"],"year":"2004","journal":"Polymer","publisher":"Elsevier BV","subject":"Organic Chemistry","type":"journal-article","sha":"cbd9a1cb400c552f7dfd239658c058f1b6afedae"}
+{"doi":"10.1016/j.pragma.2005.12.002","title":"Size–sound symbolism revisited","authors":["Reuven Tsur"],"year":"2006","journal":"Journal of Pragmatics","publisher":"Elsevier BV","subject":"Linguistics and Language","type":"journal-article","sha":"9283e341dbecdb3dc220c0a0f575c48fc64102c8"}
+{"doi":"10.1016/j.procs.2014.02.019","title":"Web Scripts and Mediation Dialogues as a Quality Factor in the Interaction of the Deaf","authors":["Aline da Silva Alves","Simone Bacellar Leal Ferreira","Viviane Santos de Oliveira Veiga","Ingrid Teixeira Monteiro","Denis Silva da Silveira","Alberto Barbosa Raposo"],"year":"2014","journal":"Procedia Computer Science","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"5d9d5292e9ead0df3f53f80feb644382e7cb016e"}
+{"doi":"10.1016/j.purol.2008.03.002","title":"Les « tumeurs superficielles de vessie » n’existent plus","authors":["J. Irani","S. Bernardini","J.-L. Davin","L. Guy","C. Mazerolles","Christian Pfister","M. Roupret","C. Roy","F. Rozet","F. Saint","C. Théodore","H. Wallerand"],"year":"2008","journal":"Progrès en Urologie","publisher":"Elsevier BV","subject":"Urology","type":"journal-article","sha":"ee01687eda02a6ead84d915ef63a78cb90bf3e2c"}
+{"doi":"10.1016/j.rboe.2015.07.001","title":"The American dream","authors":["Gilberto Luis Camanho"],"year":"2015","journal":"Revista Brasileira de Ortopedia (English Edition)","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"fe1b96869a0019ae1261fcbdf6a50455c5967581"}
+{"doi":"10.1016/j.ress.2009.02.027","title":"Inference in hybrid Bayesian networks","authors":["Helge Langseth","Thomas D. Nielsen","Rafael Rumí","Antonio Salmerón"],"year":"2009","journal":"Reliability Engineering & System Safety","publisher":"Elsevier BV","subject":"Industrial and Manufacturing Engineering","type":"journal-article","sha":"da591c9521b29e7dd5106f020ec7f4ca5ed6ffc5"}
+{"doi":"10.1016/j.sbspro.2014.12.493","title":"Modern Social Welfare in the Light of the Sustainability Model","authors":["Lubov Ivankina","Tatjana Latygovskaya"],"year":"2015","journal":"Procedia - Social and Behavioral Sciences","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"0536d30be726c8d489e8aeef87e98b538ceb79d1"}
+{"doi":"10.1016/j.schres.2010.06.016","title":"The relationship between atypical semantic activation and odd speech in schizotypy across emotionally evocative conditions","authors":["Kyle S. Minor","Alex S. Cohen","Christopher R. Weber","Laura A. Brown"],"year":"2011","journal":"Schizophrenia Research","publisher":"Elsevier BV","subject":"Biological Psychiatry","type":"journal-article","sha":"1ae06871cf35bd24c27950f818d69d66e982cb02"}
+{"doi":"10.1016/j.shpsb.2013.01.001","title":"Clausius versus Sackur–Tetrode entropies","authors":["Thomas Oikonomou","G. Baris Bagci"],"year":"2013","journal":"Studies in History and Philosophy of Science Part B: Studies in History and Philosophy of Modern Physics","publisher":"Elsevier BV","subject":"History and Philosophy of Science","type":"journal-article","sha":"818e8e5feaac15196ae6be7ab3d0d1a9a6f605c8"}
+{"doi":"10.1016/j.spl.2007.01.008","title":"The minimal entropy measure and an Esscher transform in an incomplete market model","authors":["Michael Monoyios"],"year":"2007","journal":"Statistics & Probability Letters","publisher":"Elsevier BV","subject":"Statistics, Probability and Uncertainty","type":"journal-article","sha":"713ad16e9c9c9e67dd9238ff6195b33c428e393e"}
+{"doi":"10.1016/j.susc.2005.10.035","title":"Surface energies of several ceramics with NaCl structure","authors":["W. Liu","X. Liu","W.T. Zheng","Q. Jiang"],"year":"2006","journal":"Surface Science","publisher":"Elsevier BV","subject":"Materials Chemistry","type":"journal-article","sha":"da7b1ce7d8952dad798dba5a97082969dd0eeae4"}
+{"doi":"10.1016/j.tcs.2005.05.011","title":"Structure and complexity of extreme Nash equilibria","authors":["M. Gairing","T. Lücking","M. Mavronicolas","B. Monien","P. Spirakis"],"year":"2005","journal":"Theoretical Computer Science","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"1cfbdafb7230183f919bca17320f948244f6b2d2"}
+{"doi":"10.1016/j.tcs.2010.10.022","title":"Approximation of event probabilities in noisy cellular processes","authors":["Frédéric Didier","Thomas A. Henzinger","Maria Mateescu","Verena Wolf"],"year":"2011","journal":"Theoretical Computer Science","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"9626ee305020010e52fe72ea8e08cfb16a584afa"}
+{"doi":"10.1016/j.tecto.2004.03.022","title":"Response changes of some wells in the mainland subsurface fluid monitoring network of China, due to the September 21, 1999, Ms7.6 Chi-Chi Earthquake","authors":["Fu-qiong Huang","Chun-lin Jian","Yi Tang","Gui-ming Xu","Zhi-hui Deng","Gong-cai Chi"],"year":"2004","journal":"Tectonophysics","publisher":"Elsevier BV","subject":"Earth-Surface Processes","type":"journal-article","sha":"7502714eda9f038f5c2b74e60598573cc081a28c"}
+{"doi":"10.1016/j.tics.2006.01.004","title":"Grasping the difference: what apraxia can tell us about theories of imitation","authors":["Cecilia Heyes","Marcel Brass"],"year":"2006","journal":"Trends in Cognitive Sciences","publisher":"Elsevier BV","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"61a85f3f2135b13ffd87faaf15bbac7cd9c77ac8"}
+{"doi":"10.1016/j.tpb.2006.07.006","title":"Evolutionary game dynamics in finite populations with strong selection and weak mutation","authors":["Drew Fudenberg","Martin A. Nowak","Christine Taylor","Lorens A. Imhof"],"year":"2006","journal":"Theoretical Population Biology","publisher":"Elsevier BV","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"4b0f5028ce155fa72ffa80e46e1e086395a66236"}
+{"doi":"10.1016/j.tvjl.2010.02.010","title":"Validation of the Warwick–Edinburgh Mental Well-being Scale (WEMWBS) as an overall indicator of population mental health and well-being in the UK veterinary profession","authors":["David J. Bartram","Ghasem Yadegarfar","Julia M.A. Sinclair","David S. Baldwin"],"year":"2011","journal":"The Veterinary Journal","publisher":"Elsevier BV","subject":"Animal Science and Zoology","type":"journal-article","sha":"0e935f625db1623c4fd32111a6048e61ee71d816"}
+{"doi":"10.1016/j.ultsonch.2008.03.010","title":"Incorporation of peptides in phospholipid aggregates using ultrasound","authors":["Raquel Silva","Collin Little","Helena Ferreira","Artur Cavaco-Paulo"],"year":"2008","journal":"Ultrasonics Sonochemistry","publisher":"Elsevier BV","subject":"Acoustics and Ultrasonics","type":"journal-article","sha":"b46047fe7ccbf2dcda2f63ece7d342a780cb6845"}
+{"doi":"10.1016/j.vetpar.2005.05.073","title":"Detection of Borrelia garinii, Borrelia tanukii and Borrelia sp. closely related to Borrelia valaisiana in Ixodes ticks removed from dogs and cats in Japan","authors":["Hiroko Hiraoka","Yojiro Shimada","Yoshimi Sakata","Malaika Watanabe","Kazuhito Itamoto","Masaru Okuda","Toshiyuki Masuzawa","Hisashi Inokuma"],"year":"2007","journal":"Veterinary Parasitology","publisher":"Elsevier BV","subject":"veterinary(all)","type":"journal-article","sha":"b566252f19c82f90196d9e420dc05a52819497f9"}
+{"doi":"10.1016/j.wocn.2005.06.002","title":"Asymmetric mapping from phonetic to lexical representations in second-language listening","authors":["Anne Cutler","Andrea Weber","Takashi Otake"],"year":"2006","journal":"Journal of Phonetics","publisher":"Elsevier BV","subject":"Speech and Hearing","type":"journal-article","sha":"37bdf04c5ce5303e3ecd9c1ad6e6a8836c6df4f2"}
+{"doi":"10.1016/j.wocn.2007.11.002","title":"Novel second-language words and asymmetric lexical access","authors":["Paola Escudero","Rachel Hayes-Harb","Holger Mitterer"],"year":"2008","journal":"Journal of Phonetics","publisher":"Elsevier BV","subject":"Speech and Hearing","type":"journal-article","sha":"6f433952ce2923c7eb84246b2c852a621584383f"}
+{"doi":"10.1016/j.yebeh.2013.11.013","title":"Neuropsychological abnormalities in children with the Panayiotopoulos syndrome point to parietal lobe dysfunction","authors":["Ricardo Lopes","Mário R. Simões","Alberto J.R. Leal"],"year":"2014","journal":"Epilepsy & Behavior","publisher":"Elsevier BV","subject":"Behavioral Neuroscience","type":"journal-article","sha":"82fb17df19c7142623eb551135f9f924b911e8cf"}
+{"doi":"10.1016/s0006-3207(02)00425-1","title":"A conservation plan for a global biodiversity hotspot—the Cape Floristic Region, South Africa","authors":["R.M Cowling","R.L Pressey","M Rouget","A.T Lombard"],"year":"2003","journal":"Biological Conservation","publisher":"Elsevier BV","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"cd4e3f97d7a0483ee45181ef32021d67b94ec273"}
+{"doi":"10.1016/s0006-3223(98)00316-3","title":"Basic concepts in the study of diseases with complex genetics","authors":["Margit Burmeister"],"year":"1999","journal":"Biological Psychiatry","publisher":"Elsevier BV","subject":"Biological Psychiatry","type":"journal-article","sha":"b904dd12215b01742e6a6afdca2f8f00c0432d08"}
+{"doi":"10.1016/s0009-2614(02)00667-x","title":"Tungsten disulphide coated multi-walled carbon nanotubes","authors":["R.L.D. Whitby","W.K. Hsu","C.B. Boothroyd","H.W. Kroto","D.R.M. Walton"],"year":"2002","journal":"Chemical Physics Letters","publisher":"Elsevier BV","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"e980445cfb1a113095a4a04b0cbb2254a8f2aa68"}
+{"doi":"10.1016/s0012-821x(01)00610-0","title":"U–Th dating of marine isotope stage 7 in Bahamas slope sediments","authors":["Laura F Robinson","Gideon M Henderson","Niall C Slowey"],"year":"2002","journal":"Earth and Planetary Science Letters","publisher":"Elsevier BV","subject":"Earth and Planetary Sciences (miscellaneous)","type":"journal-article","sha":"d2a32e5086fc1e50885b30a51f1c128af3287a72"}
+{"doi":"10.1016/s0022-0396(03)00089-5","title":"Uniqueness/nonuniqueness for nonnegative solutions of second-order parabolic equations of the form ut=Lu+Vu−γup in Rn","authors":["János Engländer","Ross G. Pinsky"],"year":"2003","journal":"Journal of Differential Equations","publisher":"Elsevier BV","subject":"Analysis","type":"journal-article","sha":"30b7f709cc3d9a42b10c29fecda07d9e86c487ed"}
+{"doi":"10.1016/s0024-3795(01)00419-0","title":"Brune sections in the non-stationary case","authors":["Daniel Alpay","Vladimir Bolotnikov","Patrick Dewilde","Aad Dijksma"],"year":"2002","journal":"Linear Algebra and its Applications","publisher":"Elsevier BV","subject":"Geometry and Topology","type":"journal-article","sha":"8d36bf564e0445ea60c8229927750060106e6baa"}
+{"doi":"10.1016/s0028-3932(02)00143-4","title":"Neural correlates of feeling sympathy","authors":["Jean Decety","Thierry Chaminade"],"year":"2003","journal":"Neuropsychologia","publisher":"Elsevier BV","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"cccaae1e95ac9611194a6bb12f8f63bea0dce4d2"}
+{"doi":"10.1016/s0034-4257(98)00044-3","title":"Atmospheric Precorrected Differential Absorption Technique to Retrieve Columnar Water Vapor","authors":["Daniel Schläpfer","Christoph C. Borel","Johannes Keller","Klaus I. Itten"],"year":"1998","journal":"Remote Sensing of Environment","publisher":"Elsevier BV","subject":"Computers in Earth Sciences","type":"journal-article","sha":"b205ef2e6ac9dd5b730c3249bfd867703268cd1e"}
+{"doi":"10.1016/s0040-6090(99)01009-3","title":"Short-term degradation behaviors of light emitting diodes made of polyurethane derivative with large permanent dipoles on the side chain","authors":["Hyein Jeong","Dechun Zou","Tetsuo Tsutsui","Chang-Sik Ha"],"year":"2000","journal":"Thin Solid Films","publisher":"Elsevier BV","subject":"Materials Chemistry","type":"journal-article","sha":"acaef51369c7f468f4e600b0661f5e4b7b7a03f7"}
+{"doi":"10.1016/s0042-6989(00)00234-0","title":"Invariance of long-term visual priming to scale, reflection, translation, and hemisphere","authors":["József Fiser","Irving Biederman"],"year":"2001","journal":"Vision Research","publisher":"Elsevier BV","subject":"Ophthalmology","type":"journal-article","sha":"fd93840a73a800d8f66895caf09af8bfab627c16"}
+{"doi":"10.1016/s0045-7825(98)00277-1","title":"A quadratic assumed natural strain curved triangular shell element","authors":["K.Y. Sze","Dan Zhu"],"year":"1999","journal":"Computer Methods in Applied Mechanics and Engineering","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"45effe3e1a56a2d791f00c1ae8730f5c0d7f4f03"}
+{"doi":"10.1016/s0048-7333(99)00087-6","title":"Is public R&D a complement or substitute for private R&D? A review of the econometric evidence","authors":["Paul A. David","Bronwyn H. Hall","Andrew A. Toole"],"year":"2000","journal":"Research Policy","publisher":"Elsevier BV","subject":"Management of Technology and Innovation","type":"journal-article","sha":"95d4b80a3935564369a41b0cf6bab2b2c6d76968"}
+{"doi":"10.1016/s0098-1354(98)80032-x","title":"State and disturbance estimation for nonlinear systems affine in the unmeasured variables","authors":["Michael J. Kurtz","Michael A. Henson"],"year":"1998","journal":"Computers & Chemical Engineering","publisher":"Elsevier BV","subject":"Chemical Engineering(all)","type":"journal-article","sha":"2972f2670c22fcd3f55410085b58efb5d2e4ebfa"}
+{"doi":"10.1016/s0098-3004(97)00085-x","title":"FORTRAN PROGRAMS FOR CALCULATING CONNECTIVITY OF THREE-DIMENSIONAL NUMERICAL MODELS AND FOR RANKING MULTIPLE REALIZATIONS","authors":["Clayton V. Deutsch"],"year":"1998","journal":"Computers & Geosciences","publisher":"Elsevier BV","subject":"Computers in Earth Sciences","type":"journal-article","sha":"efbd0d24d959f8dfa8e11fb71058b57818c229ab"}
+{"doi":"10.1016/s0140-3664(04)00115-x","title":"Protection performance components in MPLS networks","authors":["E CALLE"],"year":"2004","journal":"Computer Communications","publisher":"Elsevier BV","subject":"Computer Networks and Communications","type":"journal-article","sha":"ca904796998dca30a520b2639015eb01f6395852"}
+{"doi":"10.1016/s0141-9331(98)00110-0","title":"Fuzzy logic speed control of an induction motor","authors":["Jaime Fonseca","João L Afonso","Júlio S Martins","Carlos Couto"],"year":"1999","journal":"Microprocessors and Microsystems","publisher":"Elsevier BV","subject":"Computer Networks and Communications","type":"journal-article","sha":"b5d4da316000865c5ece666d6504c583c8528bc8"}
+{"doi":"10.1016/s0142-9612(97)00056-2","title":"Immobilization of invertase in conducting polymer matrices","authors":["F. Selampinar","U. Akbulut","M.Y. Özden","L. Toppare"],"year":"1997","journal":"Biomaterials","publisher":"Elsevier BV","subject":"Biophysics","type":"journal-article","sha":"40d91ff7d41ef3d5421c322abc15e8ef3b845aa8"}
+{"doi":"10.1016/s0163-1047(80)92371-7","title":"Hypophysectomy reduces behavioral activation to morphine in the rat","authors":["R.J. Katz"],"year":"1980","journal":"Behavioral and Neural Biology","publisher":"Elsevier BV","subject":"Physiology","type":"journal-article","sha":"93e3d3d702205a8362059da015da1f70075c1e46"}
+{"doi":"10.1016/s0167-4838(98)00102-2","title":"Protein structure and dynamics at high pressure","authors":["K. Heremans","L. Smeller"],"year":"1998","journal":"Biochimica et Biophysica Acta (BBA) - Protein Structure and Molecular Enzymology","publisher":"Elsevier BV","subject":"Biophysics","type":"journal-article","sha":"afe349731681d0956ea31a5b4eb4a6efc77afdd2"}
+{"doi":"10.1016/s0167-6423(02)00061-8","title":"Restructuring of COBOL/CICS legacy systems","authors":["Alex Sellink","Harry Sneed","Chris Verhoef"],"year":"2002","journal":"Science of Computer Programming","publisher":"Elsevier BV","subject":"Software","type":"journal-article","sha":"8a755d7b29cefeb2c86ca32418b910302cf598d2"}
+{"doi":"10.1016/s0168-6496(00)00066-0","title":"Measuring growth of a phenanthrene-degrading bacterial inoculum in soil with a quantitative competitive polymerase chain reaction method","authors":["E Schwartz"],"year":"2000","journal":"FEMS Microbiology Ecology","publisher":"Oxford University Press (OUP)","subject":"Ecology","type":"journal-article","sha":"fa5e28a6d84ba198a4597bd3958ca043d0d236f5"}
+{"doi":"10.1016/s0168-9525(03)00112-4","title":"Phylogeny for the faint of heart: a tutorial","authors":["Sandra L. Baldauf"],"year":"2003","journal":"Trends in Genetics","publisher":"Elsevier BV","subject":"Genetics","type":"journal-article","sha":"95454927e0a1ccbf6260feb3e08c060b5bd82450"}
+{"doi":"10.1016/s0169-1317(02)00110-2","title":"Aspects of kaolinite characterization and retention of Pb and Cd","authors":["C COLES","R YONG"],"year":"2002","journal":"Applied Clay Science","publisher":"Elsevier BV","subject":"Geology","type":"journal-article","sha":"0a6790d14853df562568cd6ceaa17689cf08a55d"}
+{"doi":"10.1016/s0169-5347(99)01662-6","title":"The panda and the phage: compensatory mutations and the persistence of small populations","authors":["Michael C Whitlock","Sarah P Otto"],"year":"1999","journal":"Trends in Ecology & Evolution","publisher":"Elsevier BV","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"4360ef549be2d8d1072df112b570549f24e8a2d2"}
+{"doi":"10.1016/s0300-2977(97)00072-7","title":"Hepatitis C anno 1997","authors":["S Schalm"],"year":"1997","journal":"The Netherlands Journal of Medicine","publisher":"Elsevier BV","subject":"Internal Medicine","type":"journal-article","sha":"f5ccedd447840d24a1adbdd93c998f8b8fa99c65"}
+{"doi":"10.1016/s0304-3975(02)00071-3","title":"On computing the entropy of cellular automata","authors":["Michele D'amico","Giovanni Manzini","Luciano Margara"],"year":"2003","journal":"Theoretical Computer Science","publisher":"Elsevier BV","subject":"Theoretical Computer Science","type":"journal-article","sha":"f7231202fa47261ae331033d74e2120dc4878a44"}
+{"doi":"10.1016/s0304-4149(97)00046-x","title":"Tracking of signal and its derivatives in Gaussian white noise","authors":["P.-L. Chow","R. Khasminskii","R. Liptser"],"year":"1997","journal":"Stochastic Processes and their Applications","publisher":"Elsevier BV","subject":"Modelling and Simulation","type":"journal-article","sha":"5c4018f4eed9d4662af344dd95847e2e8d52b91f"}
+{"doi":"10.1016/s0360-3016(03)00326-2","title":"Preoperative hyperfractionated accelerated radiotherapy (HART) and concomitant CPT-11 in locally advanced rectal carcinoma: A Phase I study","authors":["Verena Voelter","Roger Stupp","Maurice Matter","Michel Gillet","Hanifa Bouzourene","Serge Leyvraz","Philippe Coucke"],"year":"2003","journal":"International Journal of Radiation Oncology*Biology*Physics","publisher":"Elsevier BV","subject":"Cancer Research","type":"journal-article","sha":"f45150aee38cf4f897370abe5268b4f3ec659f78"}
+{"doi":"10.1016/s0376-7388(03)00179-0","title":"A study of temperature effect on chemical, structural and transport parameters determined for two different regenerated cellulose membranes","authors":["M.I Vázquez","J Benavente"],"year":"2003","journal":"Journal of Membrane Science","publisher":"Elsevier BV","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"e09d4d3ee6b7cea5fd293c079b05b87a131e6eb6"}
+{"doi":"10.1016/s0378-4266(01)00254-0","title":"Bank lending policy, credit scoring and value-at-risk","authors":["Tor Jacobson","Kasper Roszbach"],"year":"2003","journal":"Journal of Banking & Finance","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"ed5fde3673658f99edf896c2389215fb2dfa7db8"}
+{"doi":"10.1016/s0895-4356(02)00520-6","title":"How useful are unpublished data from the Food and Drug Administration in meta-analysis?","authors":["Catherine H MacLean","Sally C Morton","Joshua J Ofman","Elizabeth A Roth","Paul G Shekelle"],"year":"2003","journal":"Journal of Clinical Epidemiology","publisher":"Elsevier BV","subject":"Epidemiology","type":"journal-article","sha":"c816804edb9285b6671b652b132086e5976c1191"}
+{"doi":"10.1016/s0921-4488(00)00152-8","title":"Adrenocorticotrophic hormone and dexamethasone failed to affect milk yield in dairy goats: comparative aspects","authors":["A Shamay","S.J Mabjeesh","F Shapiro","N Silanikove"],"year":"2000","journal":"Small Ruminant Research","publisher":"Elsevier BV","subject":"Food Animals","type":"journal-article","sha":"60af0ad22c4af5f067529bddb2aaf567977eb49d"}
+{"doi":"10.1016/s0921-8009(99)00131-7","title":"Measuring the total economic value of restoring ecosystem services in an impaired river basin: results from a contingent valuation survey","authors":["John Loomis","Paula Kent","Liz Strange","Kurt Fausch","Alan Covich"],"year":"2000","journal":"Ecological Economics","publisher":"Elsevier BV","subject":"Economics and Econometrics","type":"journal-article","sha":"e4186b409dc60dd153ca871250aa5abff43a20f1"}
+{"doi":"10.1016/s0925-2312(02)00856-1","title":"Reliability and bifurcation in neurons driven by multiple sinusoids","authors":["Peter J. Thomas","Paul H.E. Tiesinga","Jean-Marc Fellous","Terrence J. Sejnowski"],"year":"2003","journal":"Neurocomputing","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"ad0d26a13361c8481d8ae842b8d71bd8051dd95f"}
+{"doi":"10.1016/s0925-5273(00)00146-8","title":"Project procurement and disposal decisions: An inventory management model","authors":["Keith A Willoughby"],"year":"2001","journal":"International Journal of Production Economics","publisher":"Elsevier BV","subject":"Management Science and Operations Research","type":"journal-article","sha":"0f810b97278361d3719d9e534e20ad9e9b22ea4f"}
+{"doi":"10.1016/s0926-5805(00)00051-0","title":"The pedagogy of virtual design studios","authors":["Thomas Kvan"],"year":"2001","journal":"Automation in Construction","publisher":"Elsevier BV","subject":"Control and Systems Engineering","type":"journal-article","sha":"4900722ee595e86b617766ec4fd0ed97b0c4e07a"}
+{"doi":"10.1016/s0927-6505(02)00153-6","title":"The information from muon arrival time distributions of high-energy EAS as measured with the KASCADE detector","authors":["T. Antoni","W.D. Apel","A.F. Badea","K. Bekk","A. Bercuci","H. Blümer","H. Bozdog","I.M. Brancus","C. Büttner","A.A. Chilingarian","K. Daumiller","P. Doll","J. Engler","F. Fessler","H.J. Gils","R. Glasstetter","R. Haeusler","A. Haungs","D. Heck","J.R. Hörandel","A. Iwan","K.H. Kampert","H.O. Klages","G. Maier","H.J. Mathes","H.J. Mayer","J. Milke","M. Müller","R. Obenland","J. Oehlschläger","S. Ostapchenko","M. Petcu","H. Rebel","M. Risse","M. Roth","G. Schatz","H. Schieler","J. Scholz","T. Thouw","H. Ulrich","J.H. Weber","A. Weindl","J. Wentz","J. Wochele","J. Zabierowski"],"year":"2003","journal":"Astroparticle Physics","publisher":"Elsevier BV","subject":"Astronomy and Astrophysics","type":"journal-article","sha":"44bfe48de81ca32666068a6003ccb83c25b6e44e"}
+{"doi":"10.1016/s0927-7757(02)00235-2","title":"Adsorption of Cu2+ and Ni2+ on iron oxide and kaolin and its importance on Ni2+ transport in porous media","authors":["Tushar Kanti Sen","S.P. Mahajan","Kartic C. Khilar"],"year":"2002","journal":"Colloids and Surfaces A: Physicochemical and Engineering Aspects","publisher":"Elsevier BV","subject":"Colloid and Surface Chemistry","type":"journal-article","sha":"81d9bbba6495a7da2562105de8a776d829c1129d"}
+{"doi":"10.1016/s0928-4931(01)00395-2","title":"Layer-by-layer self-assembly strategy for template synthesis of nanoscale devices","authors":["N.I Kovtyukhova","B.R Martin","J.K.N Mbindyo","T.E Mallouk","M Cabassi","T.S Mayer"],"year":"2002","journal":"Materials Science and Engineering: C","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"4fefc1aea0f872fd50e7908d5e1611b9e04bdb6f"}
+{"doi":"10.1016/s0928-4931(02)00008-5","title":"Alternative tissue engineering scaffolds based on starch: processing methodologies, morphology, degradation and mechanical properties","authors":["M.E Gomes","J.S Godinho","D Tchalamov","A.M Cunha","R.L Reis"],"year":"2002","journal":"Materials Science and Engineering: C","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"556624c5875da583d687fc682b732dd35fb998e7"}
+{"doi":"10.1016/s0950-5849(99)00060-9","title":"Symbolic path-based protocol verification","authors":["Wen-Chien Liu","Chyan-Goei Chung"],"year":"2000","journal":"Information and Software Technology","publisher":"Elsevier BV","subject":"Software","type":"journal-article","sha":"ec5dde0cae8987b9fdd0ce5277d43c2875afc0e1"}
+{"doi":"10.1016/s1006-706x(10)60175-6","title":"Gas-Particle Flow and Combustion Characteristics of Pulverized Coal Injection in Blast Furnace Raceway","authors":["Sheng-fu ZHANG","Chen-guang BAI","Liang-ying WEN","Gui-bao QIU","Xue-wei LÜ"],"year":"2010","journal":"Journal of Iron and Steel Research, International","publisher":"Elsevier BV","subject":"Materials Chemistry","type":"journal-article","sha":"c3f162ad89e1ae3af93e5703e6ddcc9cc76d9f82"}
+{"doi":"10.1016/s1053-8119(03)00144-7","title":"Posterior probability maps and SPMs","authors":["K.J. Friston","W. Penny"],"year":"2003","journal":"NeuroImage","publisher":"Elsevier BV","subject":"Cognitive Neuroscience","type":"journal-article","sha":"39d983858ba96f01e66f46143a7eb8efe77d55bb"}
+{"doi":"10.1016/s1055-8586(03)00030-1","title":"Fetal surgery for lung lesions, congenital diaphragmatic hernia, and sacrococcygeal teratoma","authors":["N.Scott Adzick","Yoshihiro Kitano"],"year":"2003","journal":"Seminars in Pediatric Surgery","publisher":"Elsevier BV","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"725dbb4224b73bccf9e3e7fd5ee2b28cc07e6906"}
+{"doi":"10.1016/s1369-7021(10)70032-8","title":"Graded cross-links for stronger nanomaterials","authors":["Nicola M. Pugno"],"year":"2010","journal":"Materials Today","publisher":"Elsevier BV","subject":"Mechanical Engineering","type":"journal-article","sha":"6d17d8c0c99b0fac5d7a6719734f337d9e6b7c29"}
+{"doi":"10.1016/s1381-1169(00)00420-9","title":"Dinuclear pincer-palladium(II) complexes and their use as homogeneous or heterogeneous catalyst for the aldol reaction of methyl isocyanoacetate","authors":["Raquel Giménez","Timothy M Swager"],"year":"2001","journal":"Journal of Molecular Catalysis A: Chemical","publisher":"Elsevier BV","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"18c654e234595de3939f758db3ec1714dc848cbb"}
+{"doi":"10.1017/s0004972713000816","title":"WIENER INDEX OF TREES OF GIVEN ORDER AND DIAMETER AT MOST","authors":["SIMON MUKWEMBI","TOMÃÅ  VETRÃK"],"year":"2014","journal":"Bulletin of the Australian Mathematical Society","publisher":"Cambridge University Press (CUP)","subject":"Mathematics(all)","type":"journal-article","sha":"fe431f26dd30e821138d9b22bde4ca1b3c76fbd0"}
+{"doi":"10.1017/s0021932010000106","title":"GENDER IDEOLOGY, SAME-SEX PEER GROUP AFFILIATION AND THE RELATIONSHIP BETWEEN TESTOSTERONE AND DOMINANCE IN ADOLESCENT BOYS AND GIRLS","authors":["HANS VERMEERSCH","GUY T'SJOEN","J. M. KAUFMAN","J. VINCKE","MIEKE VAN HOUTTE"],"year":"2010","journal":"Journal of Biosocial Science","publisher":"Cambridge University Press (CUP)","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"6211db99bf25d5b0ce9c698e4bda8ea27cfb487c"}
+{"doi":"10.1017/s0140525x99002198","title":"A neuron doctrine in the philosophy of neuroscience","authors":["Ian Gold","Daniel Stoljar"],"year":"1999","journal":"Behavioral and Brain Sciences","publisher":"Cambridge University Press (CUP)","subject":"Medicine(all)","type":"journal-article","sha":"0877595e8255463b313b5977fbe27188765ad123"}
+{"doi":"10.1017/s014134730001750x","title":"Differential Effects of Severe Self-injurious Behaviour on the Behaviour of Others","authors":["Scott Hall","Chris Oliver"],"year":"1992","journal":"Behavioural Psychotherapy","publisher":"Cambridge University Press (CUP)","subject":"Psychiatry and Mental health","type":"journal-article","sha":"4b5d5a1d186b4b12f0f42a9c2c3d995cd3ee99c6"}
+{"doi":"10.1017/s136510059700206x","title":"MARKET STRUCTURE, SECURITY PRICES, AND INFORMATIONAL EFFICIENCY","authors":["JENNIFER HUANG","JIANG WANG"],"year":"1997","journal":"Macroeconomic Dynamics","publisher":"Cambridge University Press (CUP)","subject":"Economics and Econometrics","type":"journal-article","sha":"8654b179494575734588266b53fdd550396805ba"}
+{"doi":"10.1017/s1366728912000405","title":"The timing and magnitude of Stroop interference and facilitation in monolinguals and bilinguals","authors":["EMILY L. CODERRE","WALTER J. B. VAN HEUVEN","KATHY CONKLIN"],"year":"2013","journal":"Bilingualism: Language and Cognition","publisher":"Cambridge University Press (CUP)","subject":"Linguistics and Language","type":"journal-article","sha":"25826b46813613a3a068c30a32a563c941c9adb2"}
+{"doi":"10.1017/s1368980012001140","title":"Estimated portion sizes in a school-aged population","authors":["Sumaiya Patel","Avni Vyas","Adnan Custovic","Clare S Murray"],"year":"2012","journal":"Public Health Nutrition","publisher":"Cambridge University Press (CUP)","subject":"Nutrition and Dietetics","type":"journal-article","sha":"80af54fbb45386a64772e403958ac571aa4c2c0f"}
+{"doi":"10.1021/ac000715k","title":"Surface Stoichiometry of Manganin Coatings Prepared by Pulsed Laser Deposition As Described by Laser-Induced Breakdown Spectrometry","authors":["L. M. Cabalín","J. J. Laserna"],"year":"2001","journal":"Analytical Chemistry","publisher":"American Chemical Society (ACS)","subject":"Analytical Chemistry","type":"journal-article","sha":"7675dc9b7d7874916b165045aa2271e9b695356d"}
+{"doi":"10.1021/ac3018263","title":"Precisely Controlled Smart Polymer Scaffold for Nanoscale Manipulation of Biomolecules","authors":["Philipp S. Spuhler","Laura Sola","Xirui Zhang","Margo R. Monroe","Joseph T. Greenspun","Marcella Chiari","M. Selim Ünlü"],"year":"2012","journal":"Analytical Chemistry","publisher":"American Chemical Society (ACS)","subject":"Analytical Chemistry","type":"journal-article","sha":"b98642a157a36a75f928e3a14b896f0dc0d2e3d2"}
+{"doi":"10.1021/bi030170z","title":"NMR Structure of an Archaeal Homologue of Ribonuclease P Protein Rpp29†","authors":["David J. Sidote","David W. Hoffman"],"year":"2003","journal":"Biochemistry","publisher":"American Chemical Society (ACS)","subject":"Biochemistry","type":"journal-article","sha":"4e648ac197a914d7c33f0717caf6775aab208df7"}
+{"doi":"10.1021/bi9921337","title":"Kinetics of Peptide Binding to the Class II MHC Protein I−Ek†","authors":["Peter M. Kasson","Joshua D. Rabinowitz","Lutz Schmitt","Mark M. Davis","Harden M. McConnell"],"year":"2000","journal":"Biochemistry","publisher":"American Chemical Society (ACS)","subject":"Biochemistry","type":"journal-article","sha":"6ee607f00b61606839c0618007b0144fb916160e"}
+{"doi":"10.1021/ci600254x","title":"Comments on “Solvation Parameters. 2. A Simplified Molecular Topology To Generate Easily Optimized Valuesâ€","authors":["Christina Mintz","William E. Acree,","Michael H. Abraham"],"year":"2006","journal":"Journal of Chemical Information and Modeling","publisher":"American Chemical Society (ACS)","subject":"Chemistry(all)","type":"journal-article","sha":"db469456a9b816e49f0d9df1f02f0e72723a53b5"}
+{"doi":"10.1021/cs500168t","title":"Evaluating Electrocatalysts for the Hydrogen Evolution Reaction Using Bipolar Electrode Arrays: Bi- and Trimetallic Combinations of Co, Fe, Ni, Mo, and W","authors":["Stephen E. Fosdick","Sean P. Berglund","C. Buddie Mullins","Richard M. Crooks"],"year":"2014","journal":"ACS Catalysis","publisher":"American Chemical Society (ACS)","subject":"","type":"journal-article","sha":"c0c1f72eecf03b335ff3a8c5381f36e0d58d7cf1"}
+{"doi":"10.1021/j100156a084","title":"Sonoluminescence from alkali-metal salt solutions","authors":["Edward B. Flint","Kenneth S. Suslick"],"year":"1991","journal":"The Journal of Physical Chemistry","publisher":"American Chemical Society (ACS)","subject":"Engineering(all)","type":"journal-article","sha":"409727f7ced48b37f73fd6ae55fcd649798ac0a7"}
+{"doi":"10.1021/ja0357689","title":"Melem (2,5,8-Triamino-tri-s-triazine), an Important Intermediate during Condensation of Melamine Rings to Graphitic Carbon Nitride:  Synthesis, Structure Determination by X-ray Powder Diffractometry, Solid-State NMR, and Theoretical Studies","authors":["Barbara Jürgens","Elisabeth Irran","Jürgen Senker","Peter Kroll","Helen Müller","Wolfgang Schnick"],"year":"2003","journal":"Journal of the American Chemical Society","publisher":"American Chemical Society (ACS)","subject":"Colloid and Surface Chemistry","type":"journal-article","sha":"d785e8b5d186d918411cf42294d848760c3ace24"}
+{"doi":"10.1021/ja044940l","title":"The Electronic Spectrum of the UO2Molecule","authors":["Laura Gagliardi","Michael C. Heaven","Jesper Wisborg Krogh","Björn O. Roos"],"year":"2005","journal":"Journal of the American Chemical Society","publisher":"American Chemical Society (ACS)","subject":"Colloid and Surface Chemistry","type":"journal-article","sha":"49d9d699be752f6013f7ec8596acf514145406bb"}
+{"doi":"10.1021/ja3087054","title":"Atomistic Theory of Ostwald Ripening and Disintegration of Supported Metal Particles under Reaction Conditions","authors":["Runhai Ouyang","Jin-Xun Liu","Wei-Xue Li"],"year":"2013","journal":"Journal of the American Chemical Society","publisher":"American Chemical Society (ACS)","subject":"Colloid and Surface Chemistry","type":"journal-article","sha":"0e668009d3be867dc097d3fd85eb34689f2d3be9"}
+{"doi":"10.1021/jp0513622","title":"Three-State Conical Intersections in Nucleic Acid Bases","authors":["Spiridoula Matsika"],"year":"2005","journal":"The Journal of Physical Chemistry A","publisher":"American Chemical Society (ACS)","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"3248056d9c214290579be25664e1a2938b71310d"}
+{"doi":"10.1021/jp057049h","title":"Morphological Characterization of Self-Assembled Peptide Nucleic Acid Amphiphiles","authors":["Cheryl Lau","Ronit Bitton","Havazelet Bianco-Peled","David G. Schultz","David J. Cookson","Shane T. Grosser","James W. Schneider"],"year":"2006","journal":"The Journal of Physical Chemistry B","publisher":"American Chemical Society (ACS)","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"c7a942237cf69a0fe6e1b8b512a3760f96ca3040"}
+{"doi":"10.1021/nl060413j","title":"Atomic Layer Deposition on Biological Macromolecules:  Metal Oxide Coating of Tobacco Mosaic Virus and Ferritin","authors":["Mato Knez","Anan Kadri","Christina Wege","Ulrich Gösele","Holger Jeske","Kornelius Nielsch"],"year":"2006","journal":"Nano Letters","publisher":"American Chemical Society (ACS)","subject":"Mechanical Engineering","type":"journal-article","sha":"a44188e8c7ea7f5506ed5d2ad686cc88f07fc898"}
+{"doi":"10.1021/nl2029078","title":"Ketjenblack Carbon Supported Amorphous Manganese Oxides Nanowires as Highly Efficient Electrocatalyst for Oxygen Reduction Reaction in Alkaline Solutions","authors":["Jang-Soo Lee","Gi Su Park","Ho Il Lee","Sun Tai Kim","Ruiguo Cao","Meilin Liu","Jaephil Cho"],"year":"2011","journal":"Nano Letters","publisher":"American Chemical Society (ACS)","subject":"Mechanical Engineering","type":"journal-article","sha":"90f1d4fc3979b0b009a7f4a057d8e1be65321026"}
+{"doi":"10.1023/a:1010071921399","title":"","authors":["Manjeet S. Dhatt","Yong H. Kim","Sandip Mukherji"],"year":"1999","journal":"Asia-Pacific Financial Markets","publisher":"Springer Nature","subject":"Finance","type":"journal-article","sha":"98ce00501f3762feea38d2e8b7d24ac4c9d35707"}
+{"doi":"10.1023/b:bioc.0000009495.95589.a7","title":"Herbivory and climatic warming: a Mediterranean outbreaking caterpillar attacks a relict, boreal pine species","authors":["José A. Hódar","Regino Zamora"],"year":"2004","journal":"Biodiversity and Conservation","publisher":"Springer Nature","subject":"Ecology","type":"journal-article","sha":"f167330dfb41f72fb7b3220703cf28e361629473"}
+{"doi":"10.1023/b:frac.0000049501.35598.87","title":"Effects of atoms on brittle fracture","authors":["M. Marder"],"year":"2004","journal":"International Journal of Fracture","publisher":"Springer Nature","subject":"Modelling and Simulation","type":"journal-article","sha":"060745effd7551ea04a085835f4193895be967d2"}
+{"doi":"10.1029/2005ja011574","title":"Modeling the size and shape of Saturn's magnetopause with variable dynamic pressure","authors":["C. S. Arridge","N. Achilleos","M. K. Dougherty","K. K. Khurana","C. T. Russell"],"year":"null","journal":"Journal of Geophysical Research","publisher":"Wiley-Blackwell","subject":"Earth and Planetary Sciences (miscellaneous)","type":"journal-article","sha":"885302ad73ff70e1a8755c757cd0f87cdfa8d4d9"}
+{"doi":"10.1029/94jd01679","title":"Conditional sampling, bursting, and the intermittent structure of sensible heat flux","authors":["Gabriel G. Katul","John Albertson","Marc Parlange","Chia-Ren Chu","Han Stricker"],"year":"null","journal":"Journal of Geophysical Research","publisher":"Wiley-Blackwell","subject":"Earth and Planetary Sciences (miscellaneous)","type":"journal-article","sha":"67f4b03a61267239c769183f9bf393ba01cfa808"}
+{"doi":"10.1029/ja084ia08p04267","title":"Observations of Pc 1-2 waves in the outer magnetosphere","authors":["Stanley M. Kaye","Margaret G. Kivelson"],"year":"1979","journal":"Journal of Geophysical Research: Space Physics","publisher":"Wiley-Blackwell","subject":"Earth and Planetary Sciences (miscellaneous)","type":"journal-article","sha":"3f5edd4c0905c59ab829d5f875629533a40b3667"}
+{"doi":"10.1034/j.1600-0587.2002.250310.x","title":"Gap colonization in the Patagonian semidesert: seed bank and diaspore morphology","authors":["Roberto J. Fernandez","Rodolfo A. Golluscio","Alejandro J. Bisigato","Alberto Soriano"],"year":"2002","journal":"Ecography","publisher":"Wiley-Blackwell","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"944d04e68cf1c5b3a854c5ac11d1307b8015e13a"}
+{"doi":"10.1037/0096-1523.23.6.1813","title":"Dependence by any other name smells just as sweet: Reply to van der Velde and van der Heijden (1997).","authors":["James C. Johnston","Eric Ruthruff","Mark Monheit"],"year":"1997","journal":"Journal of Experimental Psychology: Human Perception and Performance","publisher":"American Psychological Association (APA)","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"e65f8c840bd9d0ca6fe2152bf91e66e63ec58a55"}
+{"doi":"10.1038/431401a","title":"The geography of life","authors":["Mark Williamson"],"year":"2004","journal":"Nature","publisher":"Springer Nature","subject":"General","type":"journal-article","sha":"b90ee4df3da8b7d6aa6e4365dbeb5288a193664b"}
+{"doi":"10.1038/cmi.2008.32","title":"Intercellular Trogocytosis Plays an Important Role in Modulation of Immune Responses","authors":["Khawaja Ashfaque Ahmed","Manjunatha Ankathatti Munegowda","Yufeng Xie","Jim Xiang"],"year":"2008","journal":"Cellular and Molecular Immunology","publisher":"Springer Nature","subject":"Immunology","type":"journal-article","sha":"930317934dfdd0e407fa648248bf163da6b55ab1"}
+{"doi":"10.1038/ejcn.2013.221","title":"An introduction to the supplement ‘A practical approach to the nutritional management of children with cerebral palsy’","authors":["F Gottrand","P B Sullivan"],"year":"2013","journal":"European Journal of Clinical Nutrition","publisher":"Springer Nature","subject":"Nutrition and Dietetics","type":"journal-article","sha":"c4a1f27ca9a4973a1734b38c487b446b311bbc22"}
+{"doi":"10.1038/jid.2012.506","title":"“Patchâ€ing Up Our Tumor Signaling Knowledge","authors":["Scott X. Atwood","Ramon J. Whitson","Anthony E. Oro"],"year":"2013","journal":"Journal of Investigative Dermatology","publisher":"Elsevier BV","subject":"Cell Biology","type":"journal-article","sha":"be73f2c5deee58b28d82aa5e2b016b6e63b4f78f"}
+{"doi":"10.1038/mp.2011.31","title":"A critical role for NMDA receptors in parvalbumin interneurons for gamma rhythm induction and behavior","authors":["M Carlén","K Meletis","J H Siegle","J A Cardin","K Futai","D Vierling-Claassen","C Rühlmann","S R Jones","K Deisseroth","M Sheng","C I Moore","L-H Tsai"],"year":"2012","journal":"Molecular Psychiatry","publisher":"Springer Nature","subject":"Molecular Biology","type":"journal-article","sha":"4870ef390d6cf8c51fe0f8bf7fecf47054d8db22"}
+{"doi":"10.1038/mt.2012.58","title":"Spherical Bullet Formation via E-cadherin Promotes Therapeutic Potency of Mesenchymal Stem Cells Derived From Human Umbilical Cord Blood for Myocardial Infarction","authors":["Eun Ju Lee","Sung Jung Park","Soo Kyoung Kang","Gi-Hwan Kim","Hyun-Jae Kang","Sae-Won Lee","Hong Bae Jeon","Hyo-Soo Kim"],"year":"2012","journal":"Molecular Therapy","publisher":"Elsevier BV","subject":"Molecular Medicine","type":"journal-article","sha":"f25ede5132fd7a400ba963d5dfc2b61ebe637490"}
+{"doi":"10.1038/nature02015","title":"Demonstration of conditional gate operation using superconducting charge qubits","authors":["T. Yamamoto","Yu. A. Pashkin","O. Astafiev","Y. Nakamura","J. S. Tsai"],"year":"2003","journal":"Nature","publisher":"Springer Nature","subject":"General","type":"journal-article","sha":"897b373bdc60ef18d77f9df304d88bb4558ba597"}
+{"doi":"10.1038/nature13424","title":"Comparative analysis of the transcriptome across distant species","authors":["Mark B. Gerstein","Joel Rozowsky","Koon-Kiu Yan","Daifeng Wang","Chao Cheng","James B. Brown","Carrie A. Davis","LaDeana Hillier","Cristina Sisu","Jingyi Jessica Li","Baikang Pei","Arif O. Harmanci","Michael O. Duff","Sarah Djebali","Roger P. Alexander","Burak H. Alver","Raymond Auerbach","Kimberly Bell","Peter J. Bickel","Max E. Boeck","Nathan P. Boley","Benjamin W. Booth","Lucy Cherbas","Peter Cherbas","Chao Di","Alex Dobin","Jorg Drenkow","Brent Ewing","Gang Fang","Megan Fastuca","Elise A. Feingold","Adam Frankish","Guanjun Gao","Peter J. Good","Roderic Guigó","Ann Hammonds","Jen Harrow","Roger A. Hoskins","Cédric Howald","Long Hu","Haiyan Huang","Tim J. P. Hubbard","Chau Huynh","Sonali Jha","Dionna Kasper","Masaomi Kato","Thomas C. Kaufman","Robert R. Kitchen","Erik Ladewig","Julien Lagarde","Eric Lai","Jing Leng","Zhi Lu","Michael MacCoss","Gemma May","Rebecca McWhirter","Gennifer Merrihew","David M. Miller","Ali Mortazavi","Rabi Murad","Brian Oliver","Sara Olson","Peter J. Park","Michael J. Pazin","Norbert Perrimon","Dmitri Pervouchine","Valerie Reinke","Alexandre Reymond","Garrett Robinson","Anastasia Samsonova","Gary I. Saunders","Felix Schlesinger","Anurag Sethi","Frank J. Slack","William C. Spencer","Marcus H. Stoiber","Pnina Strasbourger","Andrea Tanzer","Owen A. Thompson","Kenneth H. Wan","Guilin Wang","Huaien Wang","Kathie L. Watkins","Jiayu Wen","Kejia Wen","Chenghai Xue","Li Yang","Kevin Yip","Chris Zaleski","Yan Zhang","Henry Zheng","Steven E. Brenner","Brenton R. Graveley","Susan E. Celniker","Thomas R. Gingeras","Robert Waterston"],"year":"null","journal":"Nature","publisher":"Springer Nature","subject":"General","type":"journal-article","sha":"70c1c270b9501f4c4c17eae8052d7caea821977f"}
+{"doi":"10.1038/nchem.907","title":"Graphene oxide as a chemically tunable platform for optical applications","authors":["Kian Ping Loh","Qiaoliang Bao","Goki Eda","Manish Chhowalla"],"year":"2010","journal":"Nature Chemistry","publisher":"Springer Nature","subject":"Chemistry(all)","type":"journal-article","sha":"f1d434a128c32e5eb27033f699b339423f24786c"}
+{"doi":"10.1038/nclimate2540","title":"Pricing climate risk mitigation","authors":["Joseph E. Aldy"],"year":"null","journal":"Nature Climate Change","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"b0c124b5edc9d6f009bfe0d7544528ca29943670"}
+{"doi":"10.1038/ncomms4739","title":"High-throughput and combinatorial gene expression on a chip for metabolism-induced toxicology screening","authors":["Seok Joon Kwon","Dong Woo Lee","Dhiral A. Shah","Bosung Ku","Sang Youl Jeon","Kusum Solanki","Jessica D. Ryan","Douglas S. Clark","Jonathan S. Dordick","Moo-Yeal Lee"],"year":"null","journal":"Nature Communications","publisher":"Springer Nature","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"cfa9b6ec9635733ce88b10f55bc78fef3f27da4c"}
+{"doi":"10.1038/ni.2563","title":"Receptor interacting protein kinase 2–mediated mitophagy regulates inflammasome activation during virus infection","authors":["Christopher Lupfer","Paul G Thomas","Paras K Anand","Peter Vogel","Sandra Milasta","Jennifer Martinez","Gonghua Huang","Maggie Green","Mondira Kundu","Hongbo Chi","Ramnik J Xavier","Douglas R Green","Mohamed Lamkanfi","Charles A Dinarello","Peter C Doherty","Thirumala-Devi Kanneganti"],"year":"null","journal":"Nature Immunology","publisher":"Springer Nature","subject":"Immunology","type":"journal-article","sha":"c40c38afc8988b23460a3d57eadbc0690d276c5e"}
+{"doi":"10.1038/nn.4047","title":"Genetic variation links creativity to psychiatric disorders","authors":["Matthew C Keller","Peter M Visscher"],"year":"null","journal":"Nature Neuroscience","publisher":"Springer Nature","subject":"Neuroscience(all)","type":"journal-article","sha":"f027644dccfc96081fe34e1ef0f183985131cf4b"}
+{"doi":"10.1038/nphys1847","title":"Observation of high-order harmonic generation in a bulk crystal","authors":["Shambhu Ghimire","Anthony D. DiChiara","Emily Sistrunk","Pierre Agostini","Louis F. DiMauro","David A. Reis"],"year":"2011","journal":"Nature Physics","publisher":"Springer Nature","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"88c97d23f5763189225a8adc73fa8fd2dbb2572d"}
+{"doi":"10.1038/npre.2011.6009","title":"Local statistics in natural scenes predict the saliency of synthetic textures","authors":["Gasper Tkacik","Gasper Tkacik","Jason Prentice","Jonathan Victor","Vijay Balasubramanian"],"year":"null","journal":"Nature Precedings","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"0ddeb395084fa286f0e98dc58e1403a6e6b5cb0e"}
+{"doi":"10.1038/nprot.2011.456","title":"Construction of BIBAC and BAC libraries from a variety of organisms for advanced genomics research","authors":["Hong-Bin Zhang","Chantel F Scheuring","Meiping Zhang","Yang Zhang","Cheng-Cang Wu","Jennifer J Dong","Yaning Li"],"year":"null","journal":"Nature Protocols","publisher":"Springer Nature","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"2d7193e7ce2215d48a26b1566fa0cb814c2df26b"}
+{"doi":"10.1038/nsmb.2108","title":"Mechanism of ubiquitylation by dimeric RING ligase RNF4","authors":["Anna Plechanovová","Ellis G Jaffray","Stephen A McMahon","Kenneth A Johnson","Iva Navrátilová","James H Naismith","Ronald T Hay"],"year":"null","journal":"Nature Structural & Molecular Biology","publisher":"Springer Nature","subject":"Molecular Biology","type":"journal-article","sha":"e87aecd7bdfdac5968dd6b981a49917146ca516f"}
+{"doi":"10.1038/nsmb0909-902","title":"When chromatin meets splicing","authors":["Alberto R Kornblihtt","Ignacio E Schor","Mariano Allo","Benjamin J Blencowe"],"year":"2009","journal":"Nature Structural & Molecular Biology","publisher":"Springer Nature","subject":"Molecular Biology","type":"journal-article","sha":"cc38e0991211fe76b55c9002fb3263182d004ce8"}
+{"doi":"10.1038/pcan.2010.25","title":"Effect of androgen deprivation therapy on the expression of prostate cancer biomarkers MSMB and MSMB-binding protein CRISP3","authors":["A Dahlman","A Edsjö","C Halldén","J L Persson","S W Fine","H Lilja","W Gerald","A Bjartell"],"year":"2010","journal":"Prostate Cancer and Prostatic Diseases","publisher":"Springer Nature","subject":"Urology","type":"journal-article","sha":"c21561a1a7c0dc7f1062c9742c6dc5f0fbbf1785"}
+{"doi":"10.1038/sj.bjc.6604999","title":"Prognostic markers in cancer: the evolution of evidence from single studies to meta-analysis, and beyond","authors":["R D Riley","W Sauerbrei","D G Altman"],"year":"2009","journal":"British Journal of Cancer","publisher":"Springer Nature","subject":"Cancer Research","type":"journal-article","sha":"359e3691c11222bd8dc479bafc2d36f5bb34697e"}
+{"doi":"10.1038/sj.onc.1204919","title":"Regulation of STAT protein synthesis by c-Cbl","authors":["Warren A Blesofsky","Kerri Mowen","Robert M Arduini","Darren P Baker","Maria A Murphy","David D L Bowtell","Michael David"],"year":"2001","journal":"Oncogene","publisher":"Springer Nature","subject":"Genetics","type":"journal-article","sha":"73c1163d676473b130701c64785117d0251a77bd"}
+{"doi":"10.1038/srep01009","title":"Mega-Bites: Extreme jaw forces of living and extinct piranhas (Serrasalmidae)","authors":["Justin R. Grubich","Steve Huskey","Stephanie Crofts","Guillermo Orti","Jorge Porto"],"year":"null","journal":"Scientific Reports","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"d1896d35ab627707fe2a9d4ec79b7209b8edac06"}
+{"doi":"10.1039/c3nr03818a","title":"Redox cycling in nanoporous electrochemical devices","authors":["Martin Hüske","Regina Stockmann","Andreas Offenhäusser","Bernhard Wolfrum"],"year":"null","journal":"Nanoscale","publisher":"Royal Society of Chemistry (RSC)","subject":"","type":"journal-article","sha":"af3c9251d4a51f90d3c7fa36ea87d7f9104a441d"}
+{"doi":"10.1039/c5cc04487a","title":"A facile solvent-free synthesis route for the assembly of a highly CO2selective and H2S tolerant NiSIFSIX metal–organic framework","authors":["Osama Shekhah","Youssef Belmabkhout","Karim Adil","Prashant M. Bhatt","Amy J. Cairns","Mohamed Eddaoudi"],"year":"null","journal":"Chem. Commun.","publisher":"Royal Society of Chemistry (RSC)","subject":"Materials Chemistry","type":"journal-article","sha":"413aea4381568571749ada231ae90868ea64db2e"}
+{"doi":"10.1042/bj0350845","title":"Mechanism of biological nitrogen fixation","authors":["Orville Wyss","C. J. Lind","J. B. Wilson","P. W. Wilson"],"year":"1941","journal":"Biochemical Journal","publisher":"Portland Press Ltd.","subject":"Cell Biology","type":"journal-article","sha":"2eeaafb756aa5d3e4c051a5c2b67ac68b1020458"}
+{"doi":"10.1046/j.1365-2036.2000.00806.x","title":"Pharmacodynamics and kinetics of omeprazole MUPS 20 mg and pantoprazole 40 mg during repeated oral administration in Helicobacter pylori-negative subjects","authors":["W. P. Geus","R. A. A. Mathot","P. G. H. Mulder","C. B. H. W. Lamers"],"year":"2000","journal":"Alimentary Pharmacology and Therapeutics","publisher":"Wiley-Blackwell","subject":"Pharmacology (medical)","type":"journal-article","sha":"24152ff3a336f3d5fc2a4439a9f456ecd728b1ff"}
+{"doi":"10.1046/j.1365-2699.2002.00724.x","title":"Availability of food resources, distribution of invasive species, and conservation of a Hawaiian bird along a gradient of elevation","authors":["Paul C. Banko","Peter T. Oboyski","John W. Slotterback","Steven J. Dougill","Daniel M. Goltz","Luanne Johnson","Megan E. Laut","T. Colleen Murray"],"year":"2002","journal":"Journal of Biogeography","publisher":"Wiley-Blackwell","subject":"Ecology","type":"journal-article","sha":"48b897d30ca56d03e85cd7ca59f86e47455a984e"}
+{"doi":"10.1046/j.1471-8278.2001.00038.x","title":"The isolation of microsatellite loci in the Mediterranean fruitfly Ceratitis capitata (Diptera: Tephritidae) using a biotin/streptavidin enrichment technique","authors":["D. G. Casey","A. M. Burnell"],"year":"null","journal":"Molecular Ecology Notes","publisher":"Wiley-Blackwell","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"d54783dcc722251bdc57a8e774889bb464b2fbc5"}
+{"doi":"10.1046/j.1475-097x.2002.00451.x","title":"Female gender increases stiffness of elastic but not of muscular arteries in type I diabetic patients","authors":["A. Ryden Ahlgren","G. Sundkvist","T. Sandgren","T. Lanne"],"year":"2002","journal":"Clinical Physiology and Functional Imaging","publisher":"Wiley-Blackwell","subject":"Physiology (medical)","type":"journal-article","sha":"f997fdcd75d9f2bdcc0ab6321d4ab4b0d62dc8d0"}
+{"doi":"10.1046/j.1540-8167.2001.00507.x","title":"Incidence and Clinical Significance of Inducible Atrial Tachycardia in Patients with Atrioventricular Nodal Reentrant Tachycardia","authors":["CHRISTIAN STICHERLING","HIROSHI TADA","RADMIRA GREENSTEIN","CHI-WO CHAN","STEVEN P. CHOUGH","ROBERT L. BAKER","KRISTINA WASMER","HAKAN ORAL","FRANK PELOSI","BRADLEY P. KNIGHT","S. ADAM STRICKBERGER","FRED MORADY"],"year":"2001","journal":"Journal of Cardiovascular Electrophysiology","publisher":"Wiley-Blackwell","subject":"Physiology (medical)","type":"journal-article","sha":"d00212db0e4ae4a5709e2b74f6342b171fcb9c33"}
+{"doi":"10.1049/iet-bmt.2012.0071","title":"Motion-based counter-measures to photo attacks in face recognition","authors":["André Anjos","Murali Mohan Chakka","Sébastien Marcel"],"year":"2014","journal":"IET Biometrics","publisher":"Institution of Engineering and Technology (IET)","subject":"","type":"journal-article","sha":"82fbf7fe9b2114b87765e2aa62204b98a44dc89b"}
+{"doi":"10.1051/0004-6361/201014053","title":"OGLE 2008–BLG–290: an accurate measurement of the limb darkening of a galactic bulge K Giant spatially resolved by microlensing","authors":["P. Fouqué","D. Heyrovský","S. Dong","A. Gould","A. Udalski","M. D. Albrow","V. Batista","J.-P. Beaulieu","D. P. Bennett","I. A. Bond","D. M. Bramich","S. Calchi Novati","A. Cassan","C. Coutures","S. Dieters","M. Dominik","D. Dominis Prester","J. Greenhill","K. Horne","U. G. Jørgensen","S. Kozłowski","D. Kubas","C.-H. Lee","J.-B. Marquette","M. Mathiasen","J. Menzies","L. A. G. Monard","S. Nishiyama","I. Papadakis","R. Street","T. Sumi","A. Williams","J. C. Yee","S. Brillant","J. A. R. Caldwell","A. Cole","K. H. Cook","J. Donatowicz","N. Kains","S. R. Kane","R. Martin","K. R. Pollard","K. C. Sahu","Y. Tsapras","J. Wambsganss","D. L. DePoy","B. S. Gaudi","C. Han","C.-U. Lee","B.-G. Park","M. Kubiak","M. K. Szymański","G. Pietrzyński","I. Soszyński","O. Szewczyk","K. Ulaczyk","F. Abe","A. Fukui","K. Furusawa","A. C. Gilmore","J. B. Hearnshaw","Y. Itow","K. Kamiya","P. M. Kilmartin","A. V. Korpela","W. Lin","C. H. Ling","K. Masuda","Y. Matsubara","N. Miyake","Y. Muraki","M. Nagaya","K. Ohnishi","T. Okumura","Y. Perrott","N. J. Rattenbury","To. Saito","T. Sako","S. Sato","L. Skuljan","D. Sullivan","W. Sweatman","P. J. Tristram","A. Allan","M. F. Bode","M. J. Burgdorf","N. Clay","S. N. Fraser","E. Hawkins","E. Kerins","T. A. Lister","C. J. Mottram","E. S. Saunders","C. Snodgrass","I. A. Steele","T. Anguita","V. Bozza","K. Harpsøe","T. C. Hinse","M. Hundertmark","P. Kjærgaard","C. Liebig","L. Mancini","G. Masi","S. Rahvar","D. Ricci","G. Scarpetta","J. Southworth","J. Surdej","C. C. Thöne","A. Riffeser","S. Seitz"," "," "," "," "," "," "],"year":"2010","journal":"Astronomy and Astrophysics","publisher":"EDP Sciences","subject":"Space and Planetary Science","type":"journal-article","sha":"85291039170304dbd08e7676b98c925d112f24d8"}
+{"doi":"10.1051/shsconf/20162302001","title":"GIS-Based Multicriteria Evaluation Approach in Planning Tourism Development Sites in Environmentally Sensitive Areas","authors":["Norhidayah Harun","Narimah Samat"],"year":"2016","journal":"SHS Web of Conferences","publisher":"EDP Sciences","subject":"","type":"journal-article","sha":"d1a88cf2ee5f613865ea355364bd0de5f991e241"}
+{"doi":"10.1053/j.ro.2005.07.001","title":"Letter from the Guest Editor","authors":["Liem T. Bui-Mansfield"],"year":"2005","journal":"Seminars in Roentgenology","publisher":"Elsevier BV","subject":"Radiology Nuclear Medicine and imaging","type":"journal-article","sha":"65cf977a65e4ce01c3794bc6109e2a9c4f322a84"}
+{"doi":"10.1056/nejmoa0806104","title":"Telaprevir with Peginterferon and Ribavirin for Chronic HCV Genotype 1 Infection","authors":["John G. McHutchison","Gregory T. Everson","Stuart C. Gordon","Ira M. Jacobson","Mark Sulkowski","Robert Kauffman","Lindsay McNair","John Alam","Andrew J. Muir"],"year":"2009","journal":"New England Journal of Medicine","publisher":"New England Journal of Medicine (NEJM/MMS)","subject":"Medicine(all)","type":"journal-article","sha":"94dd3afc470b081b977c5b81d70749894fa2dd4c"}
+{"doi":"10.1057/9780230594012_2","title":"Stock Market Development and Economic Growth","authors":["Salvatore Capasso"],"year":"2008","journal":"Domestic Resource Mobilization and Financial Development","publisher":"Springer Nature","subject":"","type":"book-chapter","sha":"98171049ce2a539648e80feb64f8b206a6435165"}
+{"doi":"10.1063/1.3368644","title":"Quantum-classical modeling of photoisomerization of polyatomic molecules","authors":["D. C. Tranca","A. A. Neufeld"],"year":"2010","journal":"The Journal of Chemical Physics","publisher":"AIP Publishing","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"a7071dee344aa04a8dab6cb5112333a7356beaca"}
+{"doi":"10.1063/1.3697816","title":"Resonances in rotationally inelastic scattering of OH(X2Π) with helium and neon","authors":["Koos B. Gubbels","Qianli Ma","Millard H. Alexander","Paul J. Dagdigian","Dick Tanis","Gerrit C. Groenenboom","Ad van der Avoird","Sebastiaan Y. T. van de Meerakker"],"year":"2012","journal":"The Journal of Chemical Physics","publisher":"AIP Publishing","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"47b4c21e49caa748e2239e0ee83de281b866c3f1"}
+{"doi":"10.1063/1.4754660","title":"Semiclassical evaluation of kinetic isotope effects in 13-atomic system","authors":["M. Kryvohuz","R. A. Marcus"],"year":"2012","journal":"The Journal of Chemical Physics","publisher":"AIP Publishing","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"6b1c71773bada90870332a23fd3b957339602535"}
+{"doi":"10.1067/j.cpsurg.2013.01.004","title":"Barrett esophagus: epidemiology, pathogenesis, diagnosis, and management","authors":["David Estores","Vic Velanovich"],"year":"2013","journal":"Current Problems in Surgery","publisher":"Elsevier BV","subject":"Surgery","type":"journal-article","sha":"12ca06aba49eaed25a78515f59b8e61f231091b5"}
+{"doi":"10.1068/p010371","title":"Single Units and Sensation: A Neuron Doctrine for Perceptual Psychology?","authors":["H B Barlow"],"year":"1972","journal":"Perception","publisher":"SAGE Publications","subject":"","type":"journal-article","sha":"16a4b9d3bdd0fdd9703bf787f6c17ca7aed55c3b"}
+{"doi":"10.1071/as98176","title":"A Search for Bright Kuiper Belt Objects","authors":["Michael J. I. Brown","R. L. Webster"],"year":"1998","journal":"Publications of the Astronomical Society of Australia","publisher":"Cambridge University Press (CUP)","subject":"Space and Planetary Science","type":"journal-article","sha":"8f16b112bd2b4392b722eff28e3cdc74c9e2a12a"}
+{"doi":"10.1071/sr14112","title":"Opportunities and constraints for biochar technology in Australian agriculture: looking beyond carbon sequestration","authors":["Balwant Singh","Lynne M. Macdonald","Rai S. Kookana","Lukas van Zwieten","Greg Butler","Stephen Joseph","Anthony Weatherley","Bhawana B. Kaudal","Andrew Regan","Julie Cattle","Feike Dijkstra","Mark Boersma","Stephen Kimber","Alexander Keith","Maryam Esfandbod"],"year":"2014","journal":"Soil Research","publisher":"CSIRO Publishing","subject":"","type":"journal-article","sha":"617ee9b0401e678f5670ede63e4f22fa08a604b5"}
+{"doi":"10.1073/pnas.0602716103","title":"Item memory, source memory, and the medial temporal lobe: Concordant findings from fMRI and memory-impaired patients","authors":["J. J. Gold","C. N. Smith","P. J. Bayley","Y. Shrager","J. B. Brewer","C. E. L. Stark","R. O. Hopkins","L. R. Squire"],"year":"2006","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"1a979e3af440652b8964c05dd6b0c81e5d19d76d"}
+{"doi":"10.1073/pnas.0903478106","title":"Starch as a major integrator in the regulation of plant growth","authors":["R. Sulpice","E.-T. Pyl","H. Ishihara","S. Trenkamp","M. Steinfath","H. Witucka-Wall","Y. Gibon","B. Usadel","F. Poree","M. C. Piques","M. Von Korff","M. C. Steinhauser","J. J. B. Keurentjes","M. Guenther","M. Hoehne","J. Selbig","A. R. Fernie","T. Altmann","M. Stitt"],"year":"2009","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"772838f3fbdae0f0f73b2df85f4ab4c21ac98082"}
+{"doi":"10.1073/pnas.0910592106","title":"Neural correlates of behavior in the moth Manduca sexta in response to complex odors","authors":["J. A. Riffell","H. Lei","J. G. Hildebrand"],"year":"2009","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"9e2ede65a1e0bf32c6fc9470d53fcc12a1b67bae"}
+{"doi":"10.1073/pnas.0911855107","title":"Toward discovery science of human brain function","authors":["B. B. Biswal","M. Mennes","X.-N. Zuo","S. Gohel","C. Kelly","S. M. Smith","C. F. Beckmann","J. S. Adelstein","R. L. Buckner","S. Colcombe","A.-M. Dogonowski","M. Ernst","D. Fair","M. Hampson","M. J. Hoptman","J. S. Hyde","V. J. Kiviniemi","R. Kotter","S.-J. Li","C.-P. Lin","M. J. Lowe","C. Mackay","D. J. Madden","K. H. Madsen","D. S. Margulies","H. S. Mayberg","K. McMahon","C. S. Monk","S. H. Mostofsky","B. J. Nagel","J. J. Pekar","S. J. Peltier","S. E. Petersen","V. Riedl","S. A. R. B. Rombouts","B. Rypma","B. L. Schlaggar","S. Schmidt","R. D. Seidler","G. J. Siegle","C. Sorg","G.-J. Teng","J. Veijola","A. Villringer","M. Walter","L. Wang","X.-C. Weng","S. Whitfield-Gabrieli","P. Williamson","C. Windischberger","Y.-F. Zang","H.-Y. Zhang","F. X. Castellanos","M. P. Milham"],"year":"2010","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"fc3311286b6b171f0eaae9b6c6a70ba5b58d8e36"}
+{"doi":"10.1073/pnas.0912279107","title":"Kudzu (Pueraria montana) invasion doubles emissions of nitric oxide and increases ozone pollution","authors":["J. E. Hickman","S. Wu","L. J. Mickley","M. T. Lerdau"],"year":"2010","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"197de024dfa683ba4e91462bf7ac0e6bf2b1e5af"}
+{"doi":"10.1073/pnas.1031596100","title":"Hessian eigenmaps: Locally linear embedding techniques for high-dimensional data","authors":["D. L. Donoho","C. Grimes"],"year":"2003","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"57a66ac4a4e0a00d2cdee8711ce0a18b49e9f7a2"}
+{"doi":"10.1073/pnas.111134598","title":"Effect of COMT Val108/158 Met genotype on frontal lobe function and risk for schizophrenia","authors":["M. F. Egan","T. E. Goldberg","B. S. Kolachana","J. H. Callicott","C. M. Mazzanti","R. E. Straub","D. Goldman","D. R. Weinberger"],"year":"2001","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"766dd93ccc4631424becd0eed6c3a472513e33ec"}
+{"doi":"10.1073/pnas.1321797111","title":"Designed amyloid fibers as materials for selective carbon dioxide capture","authors":["D. Li","H. Furukawa","H. Deng","C. Liu","O. M. Yaghi","D. S. Eisenberg"],"year":"2014","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"fbc8cbb0b2743374d2966a12836c3d4e6574ae23"}
+{"doi":"10.1073/pnas.91.12.5637","title":"The animal model of human amnesia: long-term memory impaired and short-term memory intact.","authors":["P. Alvarez","S. Zola-Morgan","L. R. Squire"],"year":"1994","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"13f0d3ec6f3a280faa269b32e801921d234ed5ea"}
+{"doi":"10.1073/pnas.97.9.4453","title":"The new animal phylogeny: Reliability and implications","authors":["A. Adoutte","G. Balavoine","N. Lartillot","O. Lespinet","B. Prud'homme","R. de Rosa"],"year":"2000","journal":"Proceedings of the National Academy of Sciences","publisher":"Proceedings of the National Academy of Sciences","subject":"General","type":"journal-article","sha":"eb9b9b19d7dfa85250362558ffc228f45da20ff6"}
+{"doi":"10.1080/01621459.2012.716340","title":"AdaptSPEC: Adaptive Spectral Estimation for Nonstationary Time Series","authors":["Ori Rosen","Sally Wood","David S. Stoffer"],"year":"2012","journal":"Journal of the American Statistical Association","publisher":"Informa UK Limited","subject":"Statistics, Probability and Uncertainty","type":"journal-article","sha":"4ea1c372db36312970ad26be40bd9abffb45e3da"}
+{"doi":"10.1080/1065657x.1998.10701918","title":"Composting Unamended Chicken Manure","authors":["D.L. Elwell","H.M. Keener","D.S. Carey","P.P. Schlak"],"year":"1998","journal":"Compost Science & Utilization","publisher":"Informa UK Limited","subject":"Ecology","type":"journal-article","sha":"0b182c89296be22625f3669d89e43b9aba709534"}
+{"doi":"10.1080/14616700903217471","title":"SALAZAR'S INTERFERENCE IN THE BBC PORTUGUESE SERVICE DURING WORLD WAR II","authors":["Nelson Ribeiro"],"year":"2010","journal":"Journalism Studies","publisher":"Informa UK Limited","subject":"Communication","type":"journal-article","sha":"edfcae4246fe5488d13226623a4afc7d138d2ead"}
+{"doi":"10.1084/jem.20081561","title":"Oncogenesis of T-ALL and nonmalignant consequences of overexpressing intracellular NOTCH1","authors":["Xiaoyu Li","Fotini Gounari","Alexei Protopopov","Khashayarsha Khazaie","Harald von Boehmer"],"year":"2008","journal":"The Journal of Experimental Medicine","publisher":"Rockefeller University Press","subject":"Immunology","type":"journal-article","sha":"0e807dca2f78ddda1e6eb0969764ce275c769e0b"}
+{"doi":"10.1085/jgp.201210946","title":"Loop C and the mechanism of acetylcholine receptor–channel gating","authors":["Prasad Purohit","Anthony Auerbach"],"year":"2013","journal":"The Journal of General Physiology","publisher":"Rockefeller University Press","subject":"Physiology","type":"journal-article","sha":"5fd996c58de49d877f77f378b0415ffc2af5bc9b"}
+{"doi":"10.1086/144385","title":"Note: Remarks on the Spectra of Comets 1941c Paraskevopoulos-De Kock) and 1941d (van Gent).","authors":["C. T. Elvey","P. Swings","H. W. Babcock"],"year":"1942","journal":"The Astrophysical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"611bb08536e765b384fd4d08f41cdddb45f3512a"}
+{"doi":"10.1086/261236","title":"A Simple Theory of International Trade with Multinational Corporations","authors":["Elhanan Helpman"],"year":"1984","journal":"Journal of Political Economy","publisher":"University of Chicago Press","subject":"Economics and Econometrics","type":"journal-article","sha":"0d639942cba43fcda79375eb570b0b0169875b45"}
+{"doi":"10.1086/375461","title":"Orbital and Collisional Evolution of the Irregular Satellites","authors":["David Nesvorn","Jose L. A. Alvarellos","Luke Dones","Harold F. Levison"],"year":"2003","journal":"The Astronomical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"84f7667d2359096e8c69717f58412e8d1ef96f38"}
+{"doi":"10.1086/424533","title":"Tilting Saturn. I. Analytic Model","authors":["William R. Ward","Douglas P. Hamilton"],"year":"2004","journal":"The Astronomical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"21c73610de2394fa0f203f595b62df2ff39a2373"}
+{"doi":"10.1086/427853","title":"The Fine Guidance Sensor Orbit of the G4 Bright Giant HD 173764","authors":["Sidney B. Parsons","Otto G. Franz","Lawrence H. Wasserman"],"year":"2005","journal":"The Astronomical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"bf514fa6dc19106b7bced910e845ad1e675f37fd"}
+{"doi":"10.1086/520489","title":"The Hall Instability of Weakly Ionized, Radially Stratified, Rotating Disks","authors":["Edward Liverts","Michael Mond","Arthur D. Chernin"],"year":"2007","journal":"The Astrophysical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"78405cb92fef6e2fa7f0e3677024e01cb074262e"}
+{"doi":"10.1086/529446","title":"Do Students Care about School Quality? Determinants of Dropout Behavior in Developing Countries","authors":["Eric A. Hanushek","Victor Lavy","Kohtaro Hitomi"],"year":"2008","journal":"Journal of Human Capital","publisher":"University of Chicago Press","subject":"Economics, Econometrics and Finance(all)","type":"journal-article","sha":"ab3d4956579e57df1714894243c70210fbbaa833"}
+{"doi":"10.1088/0004-637x/710/2/1654","title":"DISTRIBUTED FLAMES IN TYPE Ia SUPERNOVAE","authors":["A. J. Aspden","J. B. Bell","S. E. Woosley"],"year":"2010","journal":"The Astrophysical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"aa403b2887b0ef400f382bd3a5ea672b4aee5e8c"}
+{"doi":"10.1088/0004-637x/723/1/425","title":"THE ANGULAR MOMENTUM OF MAGNETIZED MOLECULAR CLOUD CORES: A TWO-DIMENSIONAL-THREE-DIMENSIONAL COMPARISON","authors":["Sami Dib","Patrick Hennebelle","Jaime E. Pineda","Timea Csengeri","Sylvain Bontemps","Edouard Audit","Alyssa A. Goodman"],"year":"2010","journal":"The Astrophysical Journal","publisher":"IOP Publishing","subject":"Space and Planetary Science","type":"journal-article","sha":"cd7c3720973d76373e2d8f960ab9fc37408506de"}
+{"doi":"10.1088/0266-5611/19/3/305","title":"The linear sampling method for non-absorbing penetrable elastic bodies","authors":["Antonios Charalambopoulos","Drossos Gintides","Kiriakie Kiriaki"],"year":"2003","journal":"Inverse Problems","publisher":"IOP Publishing","subject":"Theoretical Computer Science","type":"journal-article","sha":"90010bc519bf52056fd60a89ef534b79f626b73f"}
+{"doi":"10.1088/0953-8984/21/3/035103","title":"Thermophoresis at a charged surface: the role of hydrodynamic slip","authors":["Julien Morthomas","Alois Würger"],"year":"2009","journal":"Journal of Physics: Condensed Matter","publisher":"IOP Publishing","subject":"Materials Science(all)","type":"journal-article","sha":"1ae635422b9092a62baf7daf78c0bf9c03ea5804"}
+{"doi":"10.1088/0954-3899/34/7/s08","title":"The spin structure of the nucleon","authors":["W Vogelsang"],"year":"2007","journal":"Journal of Physics G: Nuclear and Particle Physics","publisher":"IOP Publishing","subject":"Nuclear and High Energy Physics","type":"journal-article","sha":"46e8d05168f7dfe8ba4188bdb137c68af7d95287"}
+{"doi":"10.1090/s0002-9904-1893-00121-3","title":"A bit of mathematical history","authors":["Maxime Bôcher"],"year":"1893","journal":"Bulletin of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"","type":"journal-article","sha":"b2a2e2dc16de9c139f59c7e724959ddb60d77a54"}
+{"doi":"10.1090/s0002-9904-1934-05948-2","title":"Note concerning group postulates","authors":["Raymond Garver"],"year":"1934","journal":"Bulletin of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"","type":"journal-article","sha":"b93e7cb3bcb946f40938837442ec04f333ce5c27"}
+{"doi":"10.1090/s0002-9904-1965-11275-7","title":"Variational methods for nonlinear elliptic eigenvalue problems","authors":["Felix E. Browder"],"year":"1965","journal":"Bulletin of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"","type":"journal-article","sha":"15e36c90d92766af6bf34e705baa8682a42b47ab"}
+{"doi":"10.1090/s0002-9939-04-07392-7","title":"","authors":["Ismail Kombe"],"year":"2004","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"900b1683a8a8fb212958c2730a3268fd38834edd"}
+{"doi":"10.1090/s0002-9939-07-09016-8","title":"On a Littlewood-Paley type inequality","authors":["Olivera Djordjević","Miroslav Pavlović"],"year":"2007","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"4c072ef1b71da7def7bf19056aca9259bf6c3132"}
+{"doi":"10.1090/s0002-9939-1975-0380720-3","title":"A generalization of absolute retracts","authors":["John R. Martin"],"year":"1975","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"cad8307fcb942203e9a929a352d8a2057dcb9743"}
+{"doi":"10.1090/s0002-9939-1976-0420537-5","title":"Continuous mappings from Cantor spaces onto inverse limit spectra","authors":["Alan H. Schoenfeld"],"year":"1976","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"de1abc0d203586a996e3047a6c2345733b5dabe5"}
+{"doi":"10.1090/s0002-9939-1987-0891138-7","title":"The existence of minimal regular local overrings for an arbitrary domain","authors":["Bernard Johnston"],"year":"1987","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"0be36d55c0cff44681c3b3bd49c339867b00aa92"}
+{"doi":"10.1090/s0002-9939-1993-1196168-9","title":"Canonical system on elliptic curves","authors":["Luis A. Piovan"],"year":"1993","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"3f3b73739cc3c2d568d63d921743aed3560add21"}
+{"doi":"10.1090/s0002-9939-96-03125-5","title":"","authors":["Jingyi Chen"],"year":"1996","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"fd3feb3b1279ce41a9cae80215739a09a1cc62b1"}
+{"doi":"10.1090/s0002-9939-99-04721-8","title":"","authors":["Alexander M. Blokh","John C. Mayer","Lex G. Oversteegen"],"year":"1999","journal":"Proceedings of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Applied Mathematics","type":"journal-article","sha":"b132253a7e380237879a9437d9f40dcb9a12b338"}
+{"doi":"10.1090/s0002-9947-1960-0117365-x","title":"The asymptotic expansions for the odd periodic Mathieu functions","authors":["Gertrude Blanch"],"year":"1960","journal":"Transactions of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Mathematics(all)","type":"journal-article","sha":"27cf0da5ba688134f64f1da1982995f0aef4ef94"}
+{"doi":"10.1090/s0002-9947-1966-0196720-1","title":"Decomposable chainable continua","authors":["J. B. Fugate"],"year":"1966","journal":"Transactions of the American Mathematical Society","publisher":"American Mathematical Society (AMS)","subject":"Mathematics(all)","type":"journal-article","sha":"899476c2dc5159a665f4ad95c481b82ef50f0ea5"}
+{"doi":"10.1090/s0025-5718-08-02158-3","title":"Evaluating Jacquet’s $\\\\mathbf {{\\\\rm GL}(n)}$ Whittaker function","authors":["Kevin A. Broughan"],"year":"2008","journal":"Mathematics of Computation","publisher":"American Mathematical Society (AMS)","subject":"Algebra and Number Theory","type":"journal-article","sha":"988634dd20bd765cf6493c1496136d0b59ed6e3e"}
+{"doi":"10.1093/aepp/ppp006","title":"The Adequacy of Speculation in Agricultural Futures Markets: Too Much of a Good Thing?","authors":["D. R. Sanders","S. H. Irwin","R. P. Merrin"],"year":"2010","journal":"Applied Economic Perspectives and Policy","publisher":"Oxford University Press (OUP)","subject":"Economics and Econometrics","type":"journal-article","sha":"2e0859baebaf70574a78ef0f7d7736852edb7066"}
+{"doi":"10.1093/bioinformatics/16.12.1091","title":"Protein domain decomposition using a graph-theoretic approach","authors":["Y. Xu","D. Xu","H. N. Gabow"],"year":"2000","journal":"Bioinformatics","publisher":"Oxford University Press (OUP)","subject":"Statistics and Probability","type":"journal-article","sha":"e720af0627bf52b8a46b94929644d7f296796e46"}
+{"doi":"10.1093/bioinformatics/17.12.1123","title":"A probabilistic method for identifying start codons in bacterial genomes","authors":["B. E. Suzek","M. D. Ermolaeva","M. Schreiber","S. L. Salzberg"],"year":"2001","journal":"Bioinformatics","publisher":"Oxford University Press (OUP)","subject":"Statistics and Probability","type":"journal-article","sha":"4a12a8304b52bf99646e09f2ba6bbc9b51773dfe"}
+{"doi":"10.1093/brain/119.4.1123","title":"The significance of cortical pathology in progressive supranuclear palsy: Clinico-pathological data in 10 cases","authors":["M. Verny","C. Duyckaerts","Y. Agid","J.- J. Hauw"],"year":"1996","journal":"Brain","publisher":"Oxford University Press (OUP)","subject":"Clinical Neurology","type":"journal-article","sha":"7f1625777c760cdffe67748cf56204cdd600b1bf"}
+{"doi":"10.1093/brain/124.4.647","title":"Semantic dementia: relevance to connectionist models of long-term memory","authors":["J. M. J. Murre"],"year":"2001","journal":"Brain","publisher":"Oxford University Press (OUP)","subject":"Clinical Neurology","type":"journal-article","sha":"01d676f052bedae39018ff45c3d3b908f3bb8618"}
+{"doi":"10.1093/cercor/10.7.706","title":"Maintaining and Shifting Attention within Left or Right Hemifield","authors":["R. Vandenberghe","J. Duncan","K. M. Arnell","S. J. Bishop","N. J. Herrod","A. M. Owen","P. S. Minhas","P. Dupont","J. D. Pickard","G. A. Orban"],"year":"2000","journal":"Cerebral Cortex","publisher":"Oxford University Press (OUP)","subject":"Medicine(all)","type":"journal-article","sha":"cc2df68f5e628916d0b49661943309505a07717c"}
+{"doi":"10.1093/hmg/2.5.505","title":"Chromosomal bar codes produced by multicolor fluorescence in situ hybridization with multiple YAC clones and whole chromosome painting probes","authors":["Christoph Lengauer","Michael Speicher","Susanne Popp","Anna Jauch","Masafumi Taniwaki","Ramaiah Nagaraja","Harold C. Riethman","Helen Donis-Keller","Michele D'Urso","David Schlessinger","Thomas Cremer"],"year":"1993","journal":"Human Molecular Genetics","publisher":"Oxford University Press (OUP)","subject":"Genetics(clinical)","type":"journal-article","sha":"ab022189079f98937960858c34c9ae9a34de5786"}
+{"doi":"10.1093/humrep/deg459","title":"High singleton live birth rate following classical ovulation induction in normogonadotrophic anovulatory infertility (WHO 2)","authors":["M. J.C. Eijkemans"],"year":"2003","journal":"Human Reproduction","publisher":"Oxford University Press (OUP)","subject":"Obstetrics and Gynaecology","type":"journal-article","sha":"95f8cbc8239660d8f7b2d79bfd517885286ee2da"}
+{"doi":"10.1093/ijlit/7.1.1","title":"Software piracy and the Doris Day Syndrome: some legal, ethical and social implications of contemporary conceptions of property","authors":["J Couser"],"year":"1999","journal":"International Journal of Law and Information Technology","publisher":"Oxford University Press (OUP)","subject":"Law","type":"journal-article","sha":"9039ac28aadc88bc71411c7d42be377cb10428c9"}
+{"doi":"10.1093/nar/26.2.439","title":"Molecular definition of heterogeneous nuclear ribonucleoprotein R (hnRNP R) using autoimmune antibody: Immunological relationship with hnRNP P","authors":["W. Hassfeld","E. K. L. Chan","D. A. Mathison","D. Portman","G. Dreyfuss","G. Steiner","E. M. Tan"],"year":"1998","journal":"Nucleic Acids Research","publisher":"Oxford University Press (OUP)","subject":"Genetics","type":"journal-article","sha":"f52e97904ccc2725382f93d001631857d2142b20"}
+{"doi":"10.1093/rcfs/cfu007","title":"Long-Term Debt and Hidden Borrowing","authors":["Heski Bar-Isaac","Vicente Cuñat"],"year":"2014","journal":"Review of Corporate Finance Studies","publisher":"Oxford University Press (OUP)","subject":"","type":"journal-article","sha":"8bfa0ee563338f4bc0de961e321232d3adb9d6c5"}
+{"doi":"10.1094/phyto-97-3-0373","title":" Characterization, Genetic Structure, and Pathogenicity of Rhizoctonia spp. Associated with Rice Sheath Diseases in India ","authors":["Parissa Taheri","Sam Gnanamanickam","Monica Höfte"],"year":"2007","journal":"Phytopathology","publisher":"Scientific Societies","subject":"Agronomy and Crop Science","type":"journal-article","sha":"218f91a24872eaa9e2600e5cba12006ce8febf30"}
+{"doi":"10.1097/01.ccm.0000142984.44321.a4","title":"Fluid resuscitation in severe sepsis and septic shock: An evidence-based review","authors":["Jean-Louis Vincent","Herwig Gerlach"],"year":"2004","journal":"Critical Care Medicine","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Critical Care and Intensive Care Medicine","type":"journal-article","sha":"23777308bde73123fae87daf223ae0b5f444843a"}
+{"doi":"10.1097/01.jgp.0000192489.43179.31","title":"Correlates of Self-Rated Successful Aging Among Community-Dwelling Older Adults","authors":["Lori P. Montross","Colin Depp","John Daly","Jennifer Reichstadt","Shahrokh Golshan","David Moore","David Sitzer","Dilip V. Jeste"],"year":"2006","journal":"The American Journal of Geriatric Psychiatry","publisher":"Elsevier BV","subject":"Medicine(all)","type":"journal-article","sha":"a1f3e091595770fb47b610bb58fd4ada90434431"}
+{"doi":"10.1097/01.sa.0000318701.18722.f6","title":"Vasopressin Versus Norepinephrine Infusion in Patients With Septic Shock","authors":[" &NA",""],"year":"2009","journal":"Survey of Anesthesiology","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"","type":"journal-article","sha":"fdc44a26fc090c06171d4e3a33fd4a96278c8a89"}
+{"doi":"10.1097/ajp.0b013e318192be97","title":"Are Patients With Schizophrenia Insensitive to Pain? A Reconsideration of the Question","authors":["Olivier Bonnot","George M. Anderson","David Cohen","Jean Claude Willer","Sylvie Tordjman"],"year":"2009","journal":"The Clinical Journal of Pain","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Anesthesiology and Pain Medicine","type":"journal-article","sha":"62e6b35c8bda9bdf87778536a077c658d3173c57"}
+{"doi":"10.1097/ss.0b013e31814cee60","title":"BOUNDARY LAYER THEORY DESCRIPTION OF SOLUTE TRANSPORT IN SOIL","authors":["Quanjiu Wang","Robert Horton"],"year":"2007","journal":"Soil Science","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Soil Science","type":"journal-article","sha":"c36df3125f1e254cb5627000d63c516bb9a5ad42"}
+{"doi":"10.1098/rspb.2010.1753","title":"Social organization in a flatworm: trematode parasites form soldier and reproductive castes","authors":["R. F. Hechinger","A. C. Wood","A. M. Kuris"],"year":"2011","journal":"Proceedings of the Royal Society B: Biological Sciences","publisher":"The Royal Society","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"3ad7cbb81fafd36776e909783130a73bbe690562"}
+{"doi":"10.1098/rstb.2009.0147","title":"Decision-making during gambling: an integration of cognitive and psychobiological approaches","authors":["L. Clark"],"year":"2010","journal":"Philosophical Transactions of the Royal Society B: Biological Sciences","publisher":"The Royal Society","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"69bc1412ac0c502e9fb81c7439aefd48345b6e21"}
+{"doi":"10.1099/0022-1317-69-10-2563","title":"Minute Virus of Mice Non-structural Protein NS-1 Is Necessary and Sufficient for Trans-activation of the Viral P39 Promoter","authors":["C. Doerig","B. Hirt","P. Beard","J.-P. Antonietti"],"year":"1988","journal":"Journal of General Virology","publisher":"Microbiology Society","subject":"Virology","type":"journal-article","sha":"961c96ea3b100d9eadf063aaeb5ceab46c8b2bee"}
+{"doi":"10.1103/physreva.66.042101","title":"Entanglement generation by adiabatic navigation in the space of symmetric multiparticle states","authors":["Razmik G. Unanyan","Michael Fleischhauer","Nikolay V. Vitanov","Klaas Bergmann"],"year":"null","journal":"Physical Review A","publisher":"American Physical Society (APS)","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"d2037a955d3e90bfea3441ecb460d76aaf1606be"}
+{"doi":"10.1103/physreva.67.063816","title":"Interaction of an atom with a small dispersive and absorptive dielectric body","authors":["Claudia Eberlein","Maciej Janowicz"],"year":"null","journal":"Physical Review A","publisher":"American Physical Society (APS)","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"e15db31475d4f29a5e4c8543bda0156825289beb"}
+{"doi":"10.1103/physreva.82.060302","title":"Simulation of classical thermal states on a quantum computer: A transfer-matrix approach","authors":["Man-Hong Yung","Daniel Nagaj","James D. Whitfield","Alán Aspuru-Guzik"],"year":"null","journal":"Physical Review A","publisher":"American Physical Society (APS)","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"781355f713b126600cc9a1429aabf7b5b9f1d2db"}
+{"doi":"10.1103/physrevb.55.r13424","title":"Coherent acoustic oscillations in metallic nanoparticles generated with femtosecond optical pulses","authors":["M. Nisoli","S. De Silvestri","A. Cavalleri","A. M. Malvezzi","A. Stella","G. Lanzani","P. Cheyssac","R. Kofman"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Condensed Matter Physics","type":"journal-article","sha":"dbb9ff618c33e289046030612a0a1a36d7a4b4f7"}
+{"doi":"10.1103/physrevb.60.r8473","title":"Indium-induced changes in GaN(0001) surface morphology","authors":["John E. Northrup","Jörg Neugebauer"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Condensed Matter Physics","type":"journal-article","sha":"7dbc248ad3a1676d8b350d83257bd7aded0fb935"}
+{"doi":"10.1103/physrevb.69.180402","title":"Switching-mode-dependent magnetic interlayer coupling strength in spin valves and magnetic tunnel junctions","authors":["Y. Pennec","J. Camarero","J. C. Toussaint","S. Pizzini","M. Bonfim","F. Petroff","W. Kuch","F. Offi","K. Fukumoto","F. Nguyen Van Dau","J. Vogel"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"927157219b7690a020e90c12e954ba21cd1625f2"}
+{"doi":"10.1103/physrevb.73.212415","title":"Steplike magnetization of spin chains in a triangular lattice: Monte Carlo simulations","authors":["X. Y. Yao","S. Dong","J.-M. Liu"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"c2944ff3d3756605737f2ab8ea7721f01f23765e"}
+{"doi":"10.1103/physrevb.81.125433","title":"Magnetism of substitutional Co impurities in graphene: Realization of singleπvacancies","authors":["E. J. G. Santos","D. Sánchez-Portal","A. Ayuela"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"1f6c1d96d44b7f4a7fff38045f1e6e9f7be92bc1"}
+{"doi":"10.1103/physrevb.85.224413","title":"Magnetoelectric effects in single crystals of the cubic ferrimagnetic helimagnet Cu2OSeO3","authors":["M. Belesi","I. Rousochatzakis","M. Abid","U. K. Rößler","H. Berger","J.-Ph. Ansermet"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"a85536b544ab227371da3151c853b3bad13ec017"}
+{"doi":"10.1103/physrevb.86.155123","title":"Breakdown of Fermi liquid behavior at the(Ï€,Ï€)=2kFspin-density wave quantum-critical point: The case of electron-doped cuprates","authors":["Dominic Bergeron","Debanjan Chowdhury","Matthias Punk","Subir Sachdev","A.-M. S. Tremblay"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"ac4cf4f5412b3a61b590f6643acbdd3b901c8b47"}
+{"doi":"10.1103/physrevb.89.235417","title":"Discriminating short-range from van der Waals forces using total force data in noncontact atomic force microscopy","authors":["Stefan Kuhn","Philipp Rahe"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"ca9431ff09746a7e046a955d446da9b651378c9a"}
+{"doi":"10.1103/physrevb.92.180102","title":"Solid-liquid interfacial free energy out of equilibrium","authors":["Bingqing Cheng","Gareth A. Tribello","Michele Ceriotti"],"year":"null","journal":"Physical Review B","publisher":"American Physical Society (APS)","subject":"Electronic, Optical and Magnetic Materials","type":"journal-article","sha":"edcc8d694cfb89e3ed8b7598fa7401a175d162b8"}
+{"doi":"10.1103/physreve.63.026215","title":"Stochastic multiresonance in a chaotic map with fractal basins of attraction","authors":["S. Matyjaśkiewicz","A. Krawiecki","J. A. Hołyst","K. Kacperski","W. Ebeling"],"year":"null","journal":"Physical Review E","publisher":"American Physical Society (APS)","subject":"Mathematical Physics","type":"journal-article","sha":"de2f2837ad3197807270c58e1617b107c4db4168"}
+{"doi":"10.1103/physreve.72.011103","title":"Fractional rotational diffusion of rigid dipoles in an asymmetrical double-well potential","authors":["William T. Coffey","Yuri P. Kalmykov","Sergey V. Titov","Jagdish K. Vij"],"year":"null","journal":"Physical Review E","publisher":"American Physical Society (APS)","subject":"Statistics and Probability","type":"journal-article","sha":"41f71b5c93014a12d7ac020ca880fd1913b78d7e"}
+{"doi":"10.1103/physreve.82.031803","title":"Translocation time of periodically forced polymer chains","authors":["Alessandro Fiasconaro","Juan José Mazo","Fernando Falo"],"year":"null","journal":"Physical Review E","publisher":"American Physical Society (APS)","subject":"Statistics and Probability","type":"journal-article","sha":"842382508576bc51de1fc86813a613a25cfee84d"}
+{"doi":"10.1103/physreve.87.013106","title":"Longitudinal viscosity of two-dimensional Yukawa liquids","authors":["Yan Feng","J. Goree","Bin Liu"],"year":"null","journal":"Physical Review E","publisher":"American Physical Society (APS)","subject":"Statistics and Probability","type":"journal-article","sha":"68c42613e9c1f3a78481a1d97ebe191d2eecc93f"}
+{"doi":"10.1103/physreve.88.033002","title":"Pore-scale micro-computed-tomography imaging: Nonwetting-phase cluster-size distribution during drainage and imbibition","authors":["A. Georgiadis","S. Berg","A. Makurat","G. Maitland","H. Ott"],"year":"null","journal":"Physical Review E","publisher":"American Physical Society (APS)","subject":"Statistics and Probability","type":"journal-article","sha":"26f9363b139174594fea956730dc5b4ae2b90fc5"}
+{"doi":"10.1103/physrevlett.106.040503","title":"Sustained Quantum Coherence and Entanglement in the Avian Compass","authors":["Erik M. Gauger","Elisabeth Rieper","John J. L. Morton","Simon C. Benjamin","Vlatko Vedral"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"d2e897e28d196c17d735ed9f539f2193d6fd29b3"}
+{"doi":"10.1103/physrevlett.106.152001","title":"Measurement of the Mass Difference betweentandt¯Quarks","authors":["T. Aaltonen","B. Ãlvarez González","S. Amerio","D. Amidei","A. Anastassov","A. Annovi","J. Antos","G. Apollinari","J. A. Appel","A. Apresyan","T. Arisawa","A. Artikov","J. Asaadi","W. Ashmanskas","B. Auerbach","A. Aurisano","F. Azfar","W. Badgett","A. Barbaro-Galtieri","V. E. Barnes","B. A. Barnett","P. Barria","P. Bartos","M. Bauce","G. Bauer","F. Bedeschi","D. Beecher","S. Behari","G. Bellettini","J. Bellinger","D. Benjamin","A. Beretvas","A. Bhatti","M. Binkley","D. Bisello","I. Bizjak","K. R. Bland","B. Blumenfeld","A. Bocci","A. Bodek","D. Bortoletto","J. Boudreau","A. Boveia","B. Brau","L. Brigliadori","A. Brisuda","C. Bromberg","E. Brucken","M. Bucciantonio","J. Budagov","H. S. Budd","S. Budd","K. Burkett","G. Busetto","P. Bussey","A. Buzatu","C. Calancha","S. Camarda","M. Campanelli","M. Campbell","F. Canelli","A. Canepa","B. Carls","D. Carlsmith","R. Carosi","S. Carrillo","S. Carron","B. Casal","M. Casarsa","A. Castro","P. Catastini","D. Cauz","V. Cavaliere","M. Cavalli-Sforza","A. Cerri","L. Cerrito","Y. C. Chen","M. Chertok","G. Chiarelli","G. Chlachidze","F. Chlebana","K. Cho","D. Chokheli","J. P. Chou","W. H. Chung","Y. S. Chung","C. I. Ciobanu","M. A. Ciocci","A. Clark","G. Compostella","M. E. Convery","J. Conway","M. Corbo","M. Cordelli","C. A. Cox","D. J. Cox","F. Crescioli","C. Cuenca Almenar","J. Cuevas","R. Culbertson","D. Dagenhart","N. d’Ascenzo","M. Datta","P. de Barbaro","S. De Cecco","G. De Lorenzo","M. Dell’Orso","C. Deluca","L. Demortier","J. Deng","M. Deninno","F. Devoto","M. d’Errico","A. Di Canto","B. Di Ruzza","J. R. Dittmann","M. D’Onofrio","S. Donati","P. Dong","M. Dorigo","T. Dorigo","K. Ebina","A. Elagin","A. Eppig","R. Erbacher","D. Errede","S. Errede","N. Ershaidat","R. Eusebi","H. C. Fang","S. Farrington","M. Feindt","J. P. Fernandez","C. Ferrazza","R. Field","G. Flanagan","R. Forrest","M. J. Frank","M. 
Franklin","J. C. Freeman","Y. Funakoshi","I. Furic","M. Gallinaro","J. Galyardt","J. E. Garcia","A. F. Garfinkel","P. Garosi","H. Gerberich","E. Gerchtein","S. Giagu","V. Giakoumopoulou","P. Giannetti","K. Gibson","C. M. Ginsburg","N. Giokaris","P. Giromini","M. Giunta","G. Giurgiu","V. Glagolev","D. Glenzinski","M. Gold","D. Goldin","N. Goldschmidt","A. Golossanov","G. Gomez","G. Gomez-Ceballos","M. Goncharov","O. González","I. Gorelov","A. T. Goshaw","K. Goulianos","A. Gresele","S. Grinstein","C. Grosso-Pilcher","R. C. Group","J. Guimaraes da Costa","Z. Gunay-Unalan","C. Haber","S. R. Hahn","E. Halkiadakis","A. Hamaguchi","J. Y. Han","F. Happacher","K. Hara","D. Hare","M. Hare","R. F. Harr","K. Hatakeyama","C. Hays","M. Heck","J. Heinrich","M. Herndon","S. Hewamanage","D. Hidas","A. Hocker","W. Hopkins","D. Horn","S. Hou","R. E. Hughes","M. Hurwitz","U. Husemann","N. Hussain","M. Hussein","J. Huston","G. Introzzi","M. Iori","A. Ivanov","E. James","D. Jang","B. Jayatilaka","E. J. Jeon","M. K. Jha","S. Jindariani","W. Johnson","M. Jones","K. K. Joo","S. Y. Jun","T. R. Junk","T. Kamon","P. E. Karchin","Y. Kato","W. Ketchum","J. Keung","V. Khotilovich","B. Kilminster","D. H. Kim","H. S. Kim","H. W. Kim","J. E. Kim","M. J. Kim","S. B. Kim","S. H. Kim","Y. K. Kim","N. Kimura","M. Kirby","S. Klimenko","K. Kondo","D. J. Kong","J. Konigsberg","A. V. Kotwal","M. Kreps","J. Kroll","D. Krop","N. Krumnack","M. Kruse","V. Krutelyov","T. Kuhr","M. Kurata","S. Kwang","A. T. Laasanen","S. Lami","S. Lammel","M. Lancaster","R. L. Lander","K. Lannon","A. Lath","G. Latino","I. Lazzizzera","T. LeCompte","E. Lee","H. S. Lee","J. S. Lee","S. W. Lee","S. Leo","S. Leone","J. D. Lewis","C.-J. Lin","J. Linacre","M. Lindgren","E. Lipeles","A. Lister","D. O. Litvintsev","C. Liu","Q. Liu","T. Liu","S. Lockwitz","N. S. Lockyer","A. Loginov","D. Lucchesi","J. Lueck","P. Lujan","P. Lukens","G. Lungu","J. Lys","R. Lysak","R. Madrak","K. Maeshima","K. Makhoul","P. Maksimovic","S. Malik","G. 
Manca","A. Manousakis-Katsikakis","F. Margaroli","C. Marino","M. Martínez","R. Martínez-Ballarín","P. Mastrandrea","M. Mathis","M. E. Mattson","P. Mazzanti","K. S. McFarland","P. McIntyre","R. McNulty","A. Mehta","P. Mehtala","A. Menzione","C. Mesropian","T. Miao","D. Mietlicki","A. Mitra","H. Miyake","S. Moed","N. Moggi","M. N. Mondragon","C. S. Moon","R. Moore","M. J. Morello","J. Morlock","P. Movilla Fernandez","A. Mukherjee","Th. Muller","P. Murat","M. Mussini","J. Nachtman","Y. Nagai","J. Naganoma","I. Nakano","A. Napier","J. Nett","C. Neu","M. S. Neubauer","J. Nielsen","L. Nodulman","O. Norniella","E. Nurse","L. Oakes","S. H. Oh","Y. D. Oh","I. Oksuzian","T. Okusawa","R. Orava","L. Ortolan","S. Pagan Griso","C. Pagliarone","E. Palencia","V. Papadimitriou","A. A. Paramonov","J. Patrick","G. Pauletta","M. Paulini","C. Paus","D. E. Pellett","A. Penzo","T. J. Phillips","G. Piacentino","E. Pianori","J. Pilot","K. Pitts","C. Plager","L. Pondrom","K. Potamianos","O. Poukhov","F. Prokoshin","A. Pronko","F. Ptohos","E. Pueschel","G. Punzi","J. Pursley","A. Rahaman","V. Ramakrishnan","N. Ranjan","I. Redondo","P. Renton","M. Rescigno","F. Rimondi","L. Ristori","A. Robson","T. Rodrigo","T. Rodriguez","E. Rogers","S. Rolli","R. Roser","M. Rossi","F. Rubbo","F. Ruffini","A. Ruiz","J. Russ","V. Rusu","A. Safonov","W. K. Sakumoto","Y. Sakurai","L. Santi","L. Sartori","K. Sato","V. Saveliev","A. Savoy-Navarro","P. Schlabach","A. Schmidt","E. E. Schmidt","M. P. Schmidt","M. Schmitt","T. Schwarz","L. Scodellaro","A. Scribano","F. Scuri","A. Sedov","S. Seidel","Y. Seiya","A. Semenov","F. Sforza","A. Sfyrla","S. Z. Shalhout","T. Shears","P. F. Shepard","M. Shimojima","S. Shiraishi","M. Shochet","I. Shreyber","A. Simonenko","P. Sinervo","A. Sissakian","K. Sliwa","J. R. Smith","F. D. Snider","A. Soha","S. Somalwar","V. Sorin","P. Squillacioti","M. Stancari","M. Stanitzki","R. St. Denis","B. Stelzer","O. Stelzer-Chilton","D. Stentz","J. Strologas","G. L. Strycker","Y. Sudo","A. 
Sukhanov","I. Suslov","K. Takemasa","Y. Takeuchi","J. Tang","M. Tecchio","P. K. Teng","J. Thom","J. Thome","G. A. Thompson","E. Thomson","P. Ttito-Guzmán","S. Tkaczyk","D. Toback","S. Tokar","K. Tollefson","T. Tomura","D. Tonelli","S. Torre","D. Torretta","P. Totaro","M. Trovato","Y. Tu","F. Ukegawa","S. Uozumi","A. Varganov","F. Vázquez","G. Velev","C. Vellidis","M. Vidal","I. Vila","R. Vilar","J. Vizán","M. Vogel","G. Volpi","P. Wagner","R. L. Wagner","T. Wakisaka","R. Wallny","S. M. Wang","A. Warburton","D. Waters","M. Weinberger","W. C. Wester","B. Whitehouse","D. Whiteson","A. B. Wicklund","E. Wicklund","S. Wilbur","F. Wick","H. H. Williams","J. S. Wilson","P. Wilson","B. L. Winer","P. Wittich","S. Wolbers","H. Wolfe","T. Wright","X. Wu","Z. Wu","K. Yamamoto","J. Yamaoka","T. Yang","U. K. Yang","Y. C. Yang","W.-M. Yao","G. P. Yeh","K. Yi","J. Yoh","K. Yorita","T. Yoshida","G. B. Yu","I. Yu","S. S. Yu","J. C. Yun","A. Zanetti","Y. Zeng","S. Zucchelli"," "],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"d4e24593396ffc0f58078b849c649edcb6924c80"}
+{"doi":"10.1103/physrevlett.79.3455","title":"Gold Nanobridge Stabilized by Surface Structure","authors":["Yukihito Kondo","Kunio Takayanagi"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"3a1a4a5b19be1e13f178ae11159a629b7c128788"}
+{"doi":"10.1103/physrevlett.91.188101","title":"Structure and Dynamics of Annexin 12 Bound to a Planar Lipid Bilayer","authors":["T. Risse","W. L. Hubbell","J. M. Isas","H. T. Haigler"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"e6a0c4b05013d0be043f655ecaf0bd4ceefb6067"}
+{"doi":"10.1103/physrevlett.94.197401","title":"Mechanism for Designing Metallic Metamaterials with a High Index of Refraction","authors":["J. T. Shen","Peter B. Catrysse","Shanhui Fan"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"b8885187ab73a8c595156e210d945cdf2f409498"}
+{"doi":"10.1103/physrevlett.96.233401","title":"Isomorphous Substitution in Bimetallic Oxide Clusters","authors":["E. Janssens","G. Santambrogio","M. Brümmer","L. Wöste","P. Lievens","J. Sauer","G. Meijer","K. R. Asmis"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"9f92901419db2ee4870590c2d14c2521e65ac878"}
+{"doi":"10.1103/physrevlett.97.026601","title":"Quantum Transport of Slow Charge Carriers in Quasicrystals and Correlated Systems","authors":["Guy Trambly de Laissardière","Jean-Pierre Julien","Didier Mayou"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"c9987d533fdb6921848fc2c25b69fdfa8b12fc2b"}
+{"doi":"10.1103/physrevlett.97.048104","title":"Gradient Learning in Spiking Neural Networks by Dynamic Perturbation of Conductances","authors":["Ila R. Fiete","H. Sebastian Seung"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"0f6089fb276a8ab926b735b9043263362bf19985"}
+{"doi":"10.1103/physrevlett.97.151103","title":"Interferometers for Displacement-Noise-Free Gravitational-Wave Detection","authors":["Yanbei Chen","Archana Pai","Kentaro Somiya","Seiji Kawamura","Shuichi Sato","Keiko Kokeyama","Robert L. Ward","Keisuke Goda","Eugeniy E. Mikhailov"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"a3603cc9e5824f8351c91756188905695cd8753e"}
+{"doi":"10.1103/physrevlett.98.030406","title":"Radial and Angular Rotons in Trapped Dipolar Gases","authors":["Shai Ronen","Daniele C. E. Bortolotti","John L. Bohn"],"year":"null","journal":"Physical Review Letters","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"6fbf6f2fe595720ed6d76f1ca3f318bf5b7c9379"}
+{"doi":"10.1103/revmodphys.76.1211","title":"Scaling laws in the distribution of galaxies","authors":["Bernard J. T. Jones","Vicent J. Martínez","Enn Saar","Virginia Trimble"],"year":"null","journal":"Reviews of Modern Physics","publisher":"American Physical Society (APS)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"6f93dac529da3ac314e0e494778593670a8342fd"}
+{"doi":"10.1104/pp.112.195727","title":"Mycorrhizal Networks: Common Goods of Plants Shared under Unequal Terms of Trade","authors":["F. Walder","H. Niemann","M. Natarajan","M. F. Lehmann","T. Boller","A. Wiemken"],"year":"2012","journal":"PLANT PHYSIOLOGY","publisher":"American Society of Plant Biologists (ASPB)","subject":"Plant Science","type":"journal-article","sha":"8c8ac13f8d8cbd11f824ea687cfef138c2bc7f3f"}
+{"doi":"10.1107/s0021889807007637","title":"Extending the charge-flipping method towards structure solution from incomplete data sets","authors":["Lukáš Palatinus","Walter Steurer","Gervais Chapuis"],"year":"2007","journal":"Journal of Applied Crystallography","publisher":"International Union of Crystallography (IUCr)","subject":"Biochemistry, Genetics and Molecular Biology(all)","type":"journal-article","sha":"4b070ffa07b142e827de8901c1dde8a504020ed6"}
+{"doi":"10.1107/s0567740872008027","title":"The crystal and molecular structure of 2,3-diazanaphthalene","authors":["C. Huiszoon","W. B. van der Waal","A. B. van Egmond","S. Harkema"],"year":"1972","journal":"Acta Crystallographica Section B Structural Crystallography and Crystal Chemistry","publisher":"International Union of Crystallography (IUCr)","subject":"Medicine(all)","type":"journal-article","sha":"4596674e9bdad85ae3a4c5672bca158886207cb4"}
+{"doi":"10.1108/00012530710817537","title":"Information Science at City University London","authors":["David Bawden"],"year":"2007","journal":"Aslib Proceedings","publisher":"Emerald","subject":"Library and Information Sciences","type":"journal-article","sha":"6fdbf16f81206100806e457503d4a83558f7d075"}
+{"doi":"10.1108/01437720910948375","title":"Ethnicity and the immigration of highly skilled workers to the United States","authors":["Guillermina Jasso"],"year":"2009","journal":"International Journal of Manpower","publisher":"Emerald","subject":"Management of Technology and Innovation","type":"journal-article","sha":"03684097bbb71bf17fd708f75d3480c168c99108"}
+{"doi":"10.1108/09685229710167980","title":"Managing World Wide Web publications","authors":["Schubert Foo","Ee Peng Lim"],"year":"1997","journal":"Information Management & Computer Security","publisher":"Emerald","subject":"Management Science and Operations Research","type":"journal-article","sha":"1155438832880ac7d7863f1438b758fb7f4d29d7"}
+{"doi":"10.1108/17415650780000327","title":"Navigational indices and content interlinkage on the fly","authors":["Peter Ziewer","Thomas Perst"],"year":"2007","journal":"Interactive Technology and Smart Education","publisher":"Emerald","subject":"","type":"journal-article","sha":"06c880c7317b58b5cbe38b16034bfd11d4330277"}
+{"doi":"10.1108/s0147-9121(2013)0000038001","title":"Did Trade Liberalization Help Women? the Case of Mexico in the 1990s","authors":["Ernesto Aguayo-Tellez","Jim Airola","Chinhui Juhn","Carolina Villegas-Sanchez"],"year":"2013","journal":"Research in Labor Economics","publisher":"Emerald","subject":"","type":"book-chapter","sha":"ae92c936a730d1915d9c254c5c1bc4c1c690d53e"}
+{"doi":"10.1109/25.875236","title":"Achievable performance of dynamic channel assignment schemes under varying reuse constraints","authors":["S. Borst","P. Whiting"],"year":"2000","journal":"IEEE Transactions on Vehicular Technology","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Networks and Communications","type":"journal-article","sha":"ddb810d68888ac681ea10ad004dc0c0b551d6ac9"}
+{"doi":"10.1109/2945.765325","title":"Perception of human motion with different geometric models","authors":["J.K. Hodgins","J.F. O'Brien","J. Tumblin"],"year":"1998","journal":"IEEE Transactions on Visualization and Computer Graphics","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"6b2377dfa05150795e3e95099bcf66f772b1cc5e"}
+{"doi":"10.1109/32.895986","title":"The generic consensus service","authors":["R. Guerraoui","A. Schiper"],"year":"2001","journal":"IEEE Transactions on Software Engineering","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Software","type":"journal-article","sha":"552369f8cbf567091c21529663e8f9b51c2951ce"}
+{"doi":"10.1109/36.905250","title":"Amazon floodplain water level changes measured with interferometric SIR-C radar","authors":["D.E. Alsdorf","L.C. Smith","J.M. Melack"],"year":"2001","journal":"IEEE Transactions on Geoscience and Remote Sensing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"88f3324c1b6901ca52f36a063d590df9c38796b0"}
+{"doi":"10.1109/55.728903","title":"Application of plasma immersion ion implantation doping to low-temperature processed poly-Si TFTs","authors":[" Ching-Fa Yeh"," Tai-Ju Chen"," Chung Liu"," Jiqun Shao","N.W. Cheung"],"year":"1998","journal":"IEEE Electron Device Letters","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"d33bf5ea4bceeca74a24118cf6fe360a3f4212aa"}
+{"doi":"10.1109/61.974219","title":"Time-varying harmonics. II. Harmonic summation and propagation","authors":["Y. Baghzouz","R.F. Burch","A. Capasso","A. Cavallini","A.E. Emanuel","M. Halpin","R. Langella","G. Montanari","K.J. Olejniczak","P. Ribeiro","S. Rios-Marcuello","F. Ruggiero","R. Thallam","A. Testa","P. Verde"],"year":"2002","journal":"IEEE Transactions on Power Delivery","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"a5f5c65cd8f066efb06ddd8cb6876258dfb05464"}
+{"doi":"10.1109/6104.980043","title":"Integrated capacitors for conductive lithographic film circuits","authors":["P.M. Harrey","P.S.A. Evans","D.J. Harrison"],"year":"2001","journal":"IEEE Transactions on Electronics Packaging Manufacturing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"acf4eeed48a21ef7f3947317d7109ccff8fdb5e5"}
+{"doi":"10.1109/7.81426","title":"Beamspace ML bearing estimation incorporating low-angle geometry","authors":["M.D. Zoltowski","T.-S. Lee"],"year":"1991","journal":"IEEE Transactions on Aerospace and Electronic Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"de7ab1c82e7a74993dd76e7c3a634133d5a29d5a"}
+{"doi":"10.1109/71.940748","title":"A generalized processor mapping technique for array redistribution","authors":[" Ching-Hsien Hsu"," Yeh-Ching Chung"," Don-Lin Yang"," Chyi-Ren Dow"],"year":"2001","journal":"IEEE Transactions on Parallel and Distributed Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"b90415f866733b633ca6447cc116b2ff7edf7056"}
+{"doi":"10.1109/78.552210","title":"Low complexity optimal joint detection for oversaturated multiple access communications","authors":["R.E. Learned","A.S. Willsky","D.M. Boroson"],"year":"1997","journal":"IEEE Transactions on Signal Processing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"caf7637cb58ce5ecf67e67c1572441bdbfa9bf92"}
+{"doi":"10.1109/8.791946","title":"Complex source radiation in a cylindrical radome of metal-dielectric grating","authors":["A. Altintas","S. Ouardani","V.B. Yurchenko"],"year":"1999","journal":"IEEE Transactions on Antennas and Propagation","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"e628bbec61ab955408c36fd157a6e6b300a8ce0b"}
+{"doi":"10.1109/89.622569","title":"A model of dynamic auditory perception and its application to robust word recognition","authors":["B. Strope","A. Alwan"],"year":"1997","journal":"IEEE Transactions on Speech and Audio Processing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Acoustics and Ultrasonics","type":"journal-article","sha":"f6dfa59215fb07b1646a8799e5183bf36f00cf54"}
+{"doi":"10.1109/aim.2012.6266044","title":"Motion control of electric vehicles based on robust lateral tire force control using lateral tire force sensors","authors":["Kanghyun Nam","Hiroshi Fujimoto","Yoichi Hori"],"year":"2012","journal":"2012 IEEE/ASME International Conference on Advanced Intelligent Mechatronics (AIM)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"622215ad689221a9ad585969d3e33a14ccfe4caa"}
+{"doi":"10.1109/bibe.2013.6701528","title":"EEG epileptic seizure detection using k-means clustering and marginal spectrum based on ensemble empirical mode decomposition","authors":["Paschalis A. Bizopoulos","Dimitrios G. Tsalikakis","Alexandros T. Tzallas","Dimitrios D. Koutsouris","Dimitrios I. Fotiadis"],"year":"2013","journal":"13th IEEE International Conference on BioInformatics and BioEngineering","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"db3273a22cd4a361665a3ccbcde4f0212ad77fce"}
+{"doi":"10.1109/bsc.2006.1644568","title":"Adaptive Demodulation Performance over a Rayleigh Fading Channel","authors":["J.D. Brown","K.N. Plataniotis","S. Pasupathy"],"year":"null","journal":"23rd Biennial Symposium on Communications, 2006","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"4e9ddb0f161fbe6e4c111d2bda6b444ecdc2bb23"}
+{"doi":"10.1109/cdc.1990.203676","title":"Stable policies for Petri-nets with fluctuating transition processes","authors":["C. Courcoubetis","R. Weber"],"year":"1990","journal":"29th IEEE Conference on Decision and Control","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"a399346ed95116a34d2c894235a2e6df10fcd1d3"}
+{"doi":"10.1109/cdc.2011.6160712","title":"Nonlinear control of PVTOL vehicles subjected to drag and lift","authors":["Daniele Pucci","Tarek Hamel","Pascal Morin","Claude Samson"],"year":"2011","journal":"IEEE Conference on Decision and Control and European Control Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"53d8cff786a0b02a6ff86c11cd5512fbf538eacd"}
+{"doi":"10.1109/cec.2005.1554945","title":"XCS with Computed Prediction in Continuous Multistep Environments","authors":["P.L. Lanzi","D. Loiacono","S.W. Wilson","D.E. Goldberg"],"year":"null","journal":"2005 IEEE Congress on Evolutionary Computation","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"10e7f01df21f05034b4871f1dcf8bb9d83408864"}
+{"doi":"10.1109/cic.2003.1291234","title":"Quantitative 3D assessment of myocardial viability with MRI delayed contrast enhancement","authors":["V. Positano","M.F. Santarelli","A. Pingitore","M. Lombardi","L. Landini","A. Benassi"],"year":"2003","journal":"Computers in Cardiology, 2003","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"d9d9cc9113148ec4f423a29d5f05231ae39f0c7c"}
+{"doi":"10.1109/cic.2007.4745551","title":"Robust prediction of atrial fibrillation termination usingwavelet bidomain entropy analysis","authors":["R. Alcaraz","J.J. Rieta"],"year":"2007","journal":"2007 Computers in Cardiology","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"799ff2d72f638075b3535b9b54310b8899b67210"}
+{"doi":"10.1109/cicc.2000.852719","title":"Low power bus coding techniques considering inter-wire capacitances","authors":["P.P. Sotiriadis","A. Chandrakasan"],"year":"null","journal":"Proceedings of the IEEE 2000 Custom Integrated Circuits Conference (Cat. No.00CH37044)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"58a818d85d618946b08f2a11733dfa58985c41b2"}
+{"doi":"10.1109/coec.2003.1210239","title":"Contract representation for run-time monitoring and enforcement","authors":["C. Molina-Jimenez","S. Shrivastava","E. Solaiman","J. Warne"],"year":"null","journal":"IEEE International Conference on E-Commerce, 2003. CEC 2003.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"610bc32e1f27ff489100d9f59e8a98759ec122b7"}
+{"doi":"10.1109/cvpr.2004.423","title":"Preface to Workshop on Real-Time Vision for Human-Computer Interaction","authors":["B. Kisacanin","V. Pavlovic","T.S. Huang"],"year":"null","journal":"2004 Conference on Computer Vision and Pattern Recognition Workshop","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"ad3e2fe6747bee580ae4c71af07ebc975c3cd041"}
+{"doi":"10.1109/delta.2004.10008","title":"Towards Analog and Mixed-Signal SOC Design with SystemC-AMS","authors":["A. Vachoux","C. Grimm","K. Einwich"],"year":"null","journal":"Second IEEE International Workshop on Electronic Design, Test and Applications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"e80282c38bb0cf8665ef0d1819921de6ae0a1a33"}
+{"doi":"10.1109/ecc.2015.7330765","title":"Directional real-time optimization applied to a kite-control simulation benchmark","authors":["Sean Costello","Gregory Francois","Dominique Bonvin"],"year":"2015","journal":"2015 European Control Conference (ECC)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"e0a3575b5e409c9af80b34a70d55c09d91527286"}
+{"doi":"10.1109/embc.2012.6346431","title":"Spectral clustering of shape and probability prior models for automatic prostate segmentation","authors":["S. Ghose","J. Mitra","A. Oliver","R. Marti","X. Llado","J. Freixenet","J. C. Vilanova","J. Comet","D. Sidibe","F. Meriaudeau"],"year":"2012","journal":"2012 Annual International Conference of the IEEE Engineering in Medicine and Biology Society","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"393fba8e909430ca84977b0890c932883f27c978"}
+{"doi":"10.1109/est.2012.11","title":"On the Helmholtz Principle for Data Mining","authors":["Boris Dadachev","Alexander Balinsky","Helen Balinsky","Steven Simske"],"year":"2012","journal":"2012 Third International Conference on Emerging Security Technologies","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"2f298b56b79c50e6a86e7861138ae84a2256ad96"}
+{"doi":"10.1109/fccm.2009.41","title":"Application Specific Customization and Scalability of Soft Multiprocessors","authors":["Deepak Unnikrishnan","Jia Zhao","Russell Tessier"],"year":"2009","journal":"2009 17th IEEE Symposium on Field Programmable Custom Computing Machines","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"f11a221bbf3b4858fd08725827ad89b3ab65517a"}
+{"doi":"10.1109/fpt.2003.1275800","title":"FPGA implementable architecture for geometric global positioning","authors":["A. Utgikar","G. Seetharaman"],"year":"null","journal":"Proceedings. 2003 IEEE International Conference on Field-Programmable Technology (FPT) (IEEE Cat. No.03EX798)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"a578193e7aa33fdabf221061c68025844bde0599"}
+{"doi":"10.1109/glocom.1993.318140","title":"Throughput performance of transport-layer protocols over wireless LANs","authors":["A. DeSimone"," Mooi Choo Chuah"," On-Ching Yue"],"year":"null","journal":"Proceedings of GLOBECOM '93. IEEE Global Telecommunications Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"0168e796c1d46e8ac24994653cefe33234d0c3b2"}
+{"doi":"10.1109/glocom.1998.775780","title":"TCP performance improvement over wireless ATM networks through a new AAL protocol","authors":["I.F. Akyildiz","I. Joe"],"year":"null","journal":"IEEE GLOBECOM 1998 (Cat. NO. 98CH36250)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"3b2a0f364ef96e4e6e83cbdb43157c09dfff47fd"}
+{"doi":"10.1109/glocom.2000.891298","title":"Semi-blind block channel estimation and signal detection using hidden Markov models","authors":["P. Chen","H. Kobayashi"],"year":"null","journal":"Globecom '00 - IEEE. Global Telecommunications Conference. Conference Record (Cat. No.00CH37137)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"0a0d7a95553a1c762383c896567c5021ed705891"}
+{"doi":"10.1109/glocom.2007.602","title":"On Multi-User Gain in MIMO Systems with Rate Constraints","authors":["Peng Wang","Li Ping"],"year":"2007","journal":"IEEE GLOBECOM 2007-2007 IEEE Global Telecommunications Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"bc67b9b966fc6fab3aca2cf096386e15afad28ff"}
+{"doi":"10.1109/group4.2012.6324090","title":"1.53&#x00B5;m electroluminescence of erbium excited by hot carriers in ErRE (RE=Yb, Y) silicates","authors":["Bing Wang","Ruimin Guo","Lei Wang","Xingjun Wang","Zhiping Zhou"],"year":"2012","journal":"The 9th International Conference on Group IV Photonics (GFP)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"db2fee1e448bf80b00d8328b16d24ab6f8842f48"}
+{"doi":"10.1109/hicss.1998.649228","title":"Conceptual design of data warehouses from E/R schemes","authors":["M. Golfarelli","D. Maio","S. Rizzi"],"year":"null","journal":"Proceedings of the Thirty-First Hawaii International Conference on System Sciences","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"345109f4668e8e5aed37606fee9f23ea6c30cc65"}
+{"doi":"10.1109/icassp.2005.1415690","title":"Implementation of Finite Difference Schemes for the Wave Equation on FPGA","authors":["H.E. Motuk","R. Woods","S. Bilbao"],"year":"null","journal":"Proceedings. (ICASSP '05). IEEE International Conference on Acoustics, Speech, and Signal Processing, 2005.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"f24ce5f580ee287e8a2b69f7a1f9899b052a5f9b"}
+{"doi":"10.1109/icassp.2014.6854897","title":"Hardware and algorithms for ultrasonic depth imaging","authors":["Ivan Dokmanic","Ivan Tashev"],"year":"2014","journal":"2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"55f4b11b7c07b6f44c2ce30e7b20e0ee450e3d45"}
+{"doi":"10.1109/icc.2003.1204592","title":"Packet scheduling scheme for a 3-stage Clos-network photonic switch","authors":["H.J. Chao"," Zhigang Jing"," Kung-Li Deng"],"year":"null","journal":"IEEE International Conference on Communications, 2003. ICC '03.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"994fd2707774397a3c1fde6e7ac3d9661ce83c61"}
+{"doi":"10.1109/icc.2006.254927","title":"Optimum Linear Transmitter Design for MIMO Systems with Two QPSK Data Streams","authors":["Miquel Payaro","Antonio Pascual-Iserte","Miguel Lagunas"],"year":"2006","journal":"2006 IEEE International Conference on Communications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"5593fe4e2eba0542372a1a998f59aa59c85a8dc5"}
+{"doi":"10.1109/icc.2012.6364512","title":"Optimal distributed coding schemes for energy efficiency in the fading relay channel","authors":["Fanny Parzysz","Mai H. Vu","Francois Gagnon"],"year":"2012","journal":"2012 IEEE International Conference on Communications (ICC)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"47af4052e284b48819ab263eaddaed41ed3315a8"}
+{"doi":"10.1109/iccis.2004.1460482","title":"Tabu - local search mechanism for Mega Process genetic algorithm","authors":["Y. Hanada","T. Hiroyasu","M. Miki"],"year":"null","journal":"IEEE Conference on Cybernetics and Intelligent Systems, 2004.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"ed310b267053e3007566ebea5417c4c6c3c40e2e"}
+{"doi":"10.1109/icde.2000.839439","title":"Answering regular path queries using views","authors":["D. Calvanese","G. De Giacomo","M. Lenzerini","M.Y. Vardi"],"year":"null","journal":"Proceedings of 16th International Conference on Data Engineering (Cat. No.00CB37073)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"2f5b3b4d8523087a77cbb471e9a5c340e9e1fa6e"}
+{"doi":"10.1109/icec.1995.489200","title":"An application of genetic algorithms to evolve Hopfield type optimum network architectures for object extraction","authors":[" Susmita De","A. Ghosh","S.K. Pal"],"year":"null","journal":"Proceedings of 1995 IEEE International Conference on Evolutionary Computation","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"9b314a7bd7fae6a97e76d938fd778581ba444525"}
+{"doi":"10.1109/icics.2003.1292707","title":"Correlative coding with clipping and filtering technique in OFDM systems","authors":["S.K. Yusof","N. Fisal"],"year":"null","journal":"Fourth International Conference on Information, Communications and Signal Processing, 2003 and the Fourth Pacific Rim Conference on Multimedia. Proceedings of the 2003 Joint","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"3262fa93df27937cec4ea9606dd6b7b63d51a111"}
+{"doi":"10.1109/icip.2005.1530180","title":"A weight-adaptive dynamic model for shape segmentation","authors":["K.D. Toennies","P. Benedix"],"year":"2005","journal":"IEEE International Conference on Image Processing 2005","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"190ced1171d057dc936099d0570276900d7192cf"}
+{"doi":"10.1109/icmb.2006.39","title":"Personal Heart Monitoring and Rehabilitation System using Smart Phones","authors":["Peter Leijdekkers","Valerie Gay"],"year":"2006","journal":"2006 International Conference on Mobile Business","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"7707c1b331f23145b821aa78b779aa31346e833c"}
+{"doi":"10.1109/icme.2004.1394184","title":"Accessing video archives using interactive search","authors":["M. Worring","G.P. Nguyen","L. Hollink","J.C. van Gemert","D.C. Koelma"],"year":"null","journal":"2004 IEEE International Conference on Multimedia and Expo (ICME) (IEEE Cat. No.04TH8763)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"ce71a60c44a5343cd80c3695a6ac5f4d72210cef"}
+{"doi":"10.1109/icmla.2014.76","title":"A Comparison of Supervised Machine Learning Techniques for Predicting Short-Term In-Hospital Length of Stay among Diabetic Patients","authors":["April Morton","Eman Marzban","Georgios Giannoulis","Ayush Patel","Rajender Aparasu","Ioannis A. Kakadiaris"],"year":"2014","journal":"2014 13th International Conference on Machine Learning and Applications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"3614e6b9313b0093bda33b719c28708637237323"}
+{"doi":"10.1109/icmts.2005.1452273","title":"Charge pumping at radio frequencies [MOSFET device interface state density measurement]","authors":["G.T. Sasse","H. de Vries","J. Schmitz"],"year":"null","journal":"Proceedings of the 2005 International Conference on Microelectronic Test Structures, 2005. ICMTS 2005.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"f004fa80273d40fdf78c0c24567f19092b86e574"}
+{"doi":"10.1109/icns.2006.74","title":"Mobile Ad Hoc Networking Approach to Detecting and Querying Events Related to Farm Animals","authors":["M. Radenkovic","B. Wietrzyk"],"year":"2006","journal":"International conference on Networking and Services (ICNS'06)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"c5b6edd4b31e2bcfdc625557d2a33212698084c2"}
+{"doi":"10.1109/icoin.2001.905638","title":"Ad hoc on-demand backup node setup routing protocol","authors":["C.M. Chung"," Ying-Hong Wang"," Chih-Chieh Chuang"],"year":"null","journal":"Proceedings 15th International Conference on Information Networking","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"9052c89793990fb392b0e037043f8fa686067af3"}
+{"doi":"10.1109/icpp.2009.21","title":"Perfomance Models for Blocked Sparse Matrix-Vector Multiplication Kernels","authors":["Vasileios Karakasis","Georgios Goumas","Nectarios Koziris"],"year":"2009","journal":"2009 International Conference on Parallel Processing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"e80cd796e3e26296ab081e15a4a83cd81b952917"}
+{"doi":"10.1109/icpr.2010.239","title":"A Discrete Labelling Approach to Attributed Graph Matching Using SIFT Features","authors":["Gerard Sanroma","Rene Alquezar","Francesc Serratosa"],"year":"2010","journal":"2010 20th International Conference on Pattern Recognition","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"c8d84fc463d76653494382273577e0fc3b856591"}
+{"doi":"10.1109/icra.2012.6224931","title":"Supervised learning of hidden and non-hidden 0-order affordances and detection in real scenes","authors":["Aitor Aldoma","Federico Tombari","Markus Vincze"],"year":"2012","journal":"2012 IEEE International Conference on Robotics and Automation","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"60d304023269254fe2c96a8bac15ea47777bb679"}
+{"doi":"10.1109/iedm.1998.746345","title":"A new dynamic model for the kink effect in InAlAs/InGaAs HEMTs","authors":["M.H. Somerville","A. Ernst","J.A. del Alamo"],"year":"null","journal":"International Electron Devices Meeting 1998. Technical Digest (Cat. No.98CH36217)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"975e033890ab9752b51e6c166263e448fe99da25"}
+{"doi":"10.1109/igarss.2000.861734","title":"Performance and processing of SAR satellite clusters","authors":["J. Stiles","N. Goodman"," SiChung Lin"],"year":"null","journal":"IGARSS 2000. IEEE 2000 International Geoscience and Remote Sensing Symposium. Taking the Pulse of the Planet: The Role of Remote Sensing in Managing the Environment. Proceedings (Cat. No.00CH37120)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"5b4986a6b8cb315bee08d4c2a206e2ef18e14be5"}
+{"doi":"10.1109/imoc.2009.5427492","title":"Per-band link control transients protection in distributed fiber Raman amplifier cascades","authors":["Victor Pincheira","Marcelo A. Soto","Ricardo Olivares"],"year":"2009","journal":"2009 SBMO/IEEE MTT-S International Microwave and Optoelectronics Conference (IMOC)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"4e86572fe35d91529d210b7878de2a90a2c45a26"}
+{"doi":"10.1109/infcom.2007.238","title":"A Performance Study of Deployment Factors in Wireless Mesh Networks","authors":["J. Robinson","E. W. Knightly"],"year":"2007","journal":"IEEE INFOCOM 2007 - 26th IEEE International Conference on Computer Communications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"9f50c9285a1f484bf724964ed38c15712d02d4f1"}
+{"doi":"10.1109/infocom.2014.6848075","title":"Transductive malware label propagation: Find your lineage from your neighbors","authors":["Deguang Kong","Guanhua Yan"],"year":"2014","journal":"IEEE INFOCOM 2014 - IEEE Conference on Computer Communications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"5021090a00667faffd8d2816177d59fe7650c013"}
+{"doi":"10.1109/ipdps.2000.845969","title":"Parallel performance study of Monte Carlo photon transport code on shared-, distributed-, and distributed-shared-memory architectures","authors":["A. Majumdar"],"year":"null","journal":"Proceedings 14th International Parallel and Distributed Processing Symposium. IPDPS 2000","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"065036bd75768c0e9f0f31e4527c2ae4287c3eb8"}
+{"doi":"10.1109/ipdps.2004.1303343","title":"Visual formal specification using (N)TLcharts: statechart automata with temporal logic and natural language conditioned transitions","authors":["D. Drusinsky"],"year":"null","journal":"18th International Parallel and Distributed Processing Symposium, 2004. Proceedings.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"513ed364e8b2e27b6ede8acc6aaa6e685e04cc6c"}
+{"doi":"10.1109/ipdps.2007.370341","title":"Using Speed Diagrams for Symbolic Quality Management","authors":["Jacques Combaz","Jean-Claude Fernandez","Joseph Sifakis","Loic Strus"],"year":"2007","journal":"2007 IEEE International Parallel and Distributed Processing Symposium","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"87b6c263c287803c88b55ea8511b937d78443ed9"}
+{"doi":"10.1109/isit.2008.4594986","title":"Capacity bounds for the Gaussian Interference Channel","authors":["Abolfazl S. Motahari","Amir K. Khandani"],"year":"2008","journal":"2008 IEEE International Symposium on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"c120df9b9f8baf66d60ee6ea7665ff0265dac2a1"}
+{"doi":"10.1109/isms.2014.76","title":"Model Based On-Line Energy Prediction System for Semi-autonomous Mobile Robots","authors":["Ramviyas Parasuraman","Keith Kershaw","Prithvi Pagala","Manuel Ferre"],"year":"2014","journal":"2014 5th International Conference on Intelligent Systems, Modelling and Simulation","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"968e14596309da395dd292e461388812a391aaf3"}
+{"doi":"10.1109/isorc.2000.839549","title":"Agent-oriented material flow control system based on DCOM","authors":["R. Schoop","R. Neubert"],"year":"null","journal":"Proceedings Third IEEE International Symposium on Object-Oriented Real-Time Distributed Computing (ISORC 2000) (Cat. No. PR00607)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"90ea42b46f9b4740e291a5e95ff75ce2c3fc42e3"}
+{"doi":"10.1109/jsac-ocn.2006.22605","title":"An improved algorithm for optimal lightpath establishment on a tree topology","authors":[" Guoliang Xue"," Weiyi Zhang"," Jian Tang","K. Thulasiraman"],"year":"2006","journal":"IEEE Journal on Selected Areas in Communications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Networks and Communications","type":"journal-article","sha":"ff55b60fee69c2dca318934080e4d9446fd3020b"}
+{"doi":"10.1109/jssc.2003.822890","title":"A New DLL-Based Approach for All-Digital Multiphase Clock Generation","authors":["C.-C. Chung","C.-Y. Lee"],"year":"2004","journal":"IEEE Journal of Solid-State Circuits","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"f85e251669faba289436ed344ebbf897ddf85f3a"}
+{"doi":"10.1109/lpe.2001.945411","title":"Effects of elevated temperature on tunable near-zero threshold CMOS","authors":["V. Svilan","J.B. Burr","G.L. Tyler"],"year":"null","journal":"ISLPED'01: Proceedings of the 2001 International Symposium on Low Power Electronics and Design (IEEE Cat. No.01TH8581)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"01facce80940a30b7eae2e52d483b2849ab9e8b1"}
+{"doi":"10.1109/lpe.2003.1231870","title":"Ambient intelligence - industrial research on a visionary concept","authors":["W. Weber"],"year":"null","journal":"Proceedings of the 2003 International Symposium on Low Power Electronics and Design, 2003. ISLPED '03.","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"db1c87c022f9abe4f374a8e1e9af951c9b111608"}
+{"doi":"10.1109/map.1995.475876","title":"Database of \"In Vivo\" Measurements for Quantitative Microwave Imaging and Reconstruction Algorithms","authors":["J.J. Mallorqui","N. Joachimowicz","J.Ch. Bolomey","A. Broquetas"],"year":"1995","journal":"IEEE Antennas and Propagation Magazine","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"a7968e86589676188d45ddac1bf84b96e8dd2b78"}
+{"doi":"10.1109/mass.2013.61","title":"Resource Allocation with Non-deterministic Demands and Profits","authors":["Nan Hu","Diego Pizzocaro","Matthew P. Johnson","Thomas Laporta","Alun D. Preece"],"year":"2013","journal":"2013 IEEE 10th International Conference on Mobile Ad-Hoc and Sensor Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"59a64bc4e20ea824471f152c509b1851525d8d59"}
+{"doi":"10.1109/mc.2003.1244536","title":"Research feature - SCTP: a proposed standard for robust internet data transport","authors":["A.L. Caro","J.R. Iyengar","P.D. Amer","S. Ladha","G.J. Heinz","K.C. Shah"],"year":"2003","journal":"Computer","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Science(all)","type":"journal-article","sha":"9a140ce01c1f4404a9437fb720ba1c2fe4f162d1"}
+{"doi":"10.1109/mcom.2012.6231279","title":"Caching and mobility support in a publish-subscribe internet architecture","authors":["G. Xylomenos","X. Vasilakos","C. Tsilopoulos","V. A. Siris","G. C. Polyzos"],"year":"2012","journal":"IEEE Communications Magazine","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Networks and Communications","type":"journal-article","sha":"0800f70e58a458fc660baf733aa1dcbb5f516e24"}
+{"doi":"10.1109/mwscas.2006.382032","title":"An Energy-Efficient Differential Flip-Flop for Deeply Pipelined Systems","authors":["Mitchell J. Myjak","Jose G. Delgado-Frias","Seon Kwang Jeon"],"year":"2006","journal":"2006 49th IEEE International Midwest Symposium on Circuits and Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"ea772b62de1d1e01cbf8778b2be57482d27fe591"}
+{"doi":"10.1109/nems.2013.6559681","title":"Manipulation of DNA origami nanotubes in liquid using a programmable tapping mode AFM","authors":[" Longhai Li"," Xiaojun Tian","Zaili Dong"," Lianqing Liu","Osamu Tabata","Wen J. Li"],"year":"2013","journal":"The 8th Annual IEEE International Conference on Nano/Micro Engineered and Molecular Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"b27b7df220089b2f9666025071127409b4149c03"}
+{"doi":"10.1109/p2p.2005.34","title":"Self-Stabilizing Structured Ring Topology P2P Systems","authors":["A. Shaker","D.S. Reeves"],"year":"null","journal":"Fifth IEEE International Conference on Peer-to-Peer Computing (P2P'05)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"4216191d4d5551a43228c60afaa79c62f1b07850"}
+{"doi":"10.1109/pac.1995.505847","title":"Transport of bunched beams with space charge through a periodic lattice","authors":["M.F. Reusch","D.L. Bruhwiler"],"year":"null","journal":"Proceedings Particle Accelerator Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"42adebc7bc21eb93e6e7df07e27008d9d9226a2c"}
+{"doi":"10.1109/pdse.1999.779744","title":"Evaluation of a methodology for the reverse engineering and parallelization of sequential code","authors":[" Andersen"," Pizzi"," Zhu"," Cao"," Bagert"," Antonio"," Lott"," Grieger"],"year":"1999","journal":"Proceedings International Symposium on Software Engineering for Parallel and Distributed Systems PDSE-99","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"466390872d14ccb22865be037558e46a9ca72293"}
+{"doi":"10.1109/pesc.2007.4342054","title":"Single-chip FPGA Implementation of a Sensorless Speed Control IC for Permanent Magnet Synchronous Motors","authors":["Yen-Chuan Chang","Ying-Yu Tzou"],"year":"2007","journal":"2007 IEEE Power Electronics Specialists Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"e067e437b60feeaf4b38339988a911e58f5b5d02"}
+{"doi":"10.1109/sahcnw.2008.16","title":"On Redundant Multipath Operating System Support for Wireless Mesh Networks","authors":["Yair Amir","Claudiu Danilov","Michael A. Kaplan","Raluca Musaloiu-Elefteri","Nilo Rivera"],"year":"2008","journal":"2008 5th IEEE Annual Communications Society Conference on Sensor, Mesh and Ad Hoc Communications and Networks Workshops","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"6d30758efb1dddbd0780e4dac60e0fa7cdd010bb"}
+{"doi":"10.1109/spi.2004.1409052","title":"Layout synthesis algorithm of embedded passive components for RF and EMC reliable system design","authors":["G. Sommer","W. John","H. Reichl"],"year":"null","journal":"Proceedings. 45th Annual IEEE Symposium on Foundations of Computer Science","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"85a1869dcb018ff286d520b3b110fbac48be2604"}
+{"doi":"10.1109/srds.2007.4365692","title":"A Language-Based Approach for Improving the Robustness of Network Application Protocol Implementations","authors":["Laurent Burgy","Laurent Reveillere","Julia L. Lawall","Gilles Muller"],"year":"2007","journal":"2007 26th IEEE International Symposium on Reliable Distributed Systems (SRDS 2007)","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"1ee93e4ae97c19cd14a024a99e0bfcafaa937320"}
+{"doi":"10.1109/ssp.2003.1289460","title":"MCMC-based peak template matching for GCXGC","authors":[" Mingtian Ni"," Qingping Tao","S.E. Reichenbach"],"year":"null","journal":"IEEE Workshop on Statistical Signal Processing, 2003","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"315b26e4648f3daa1c455959efeb0cf25eabc4b0"}
+{"doi":"10.1109/tasc.2007.899063","title":"Strongly Coupled Artificial Bulk HTS Grain Boundaries With High Critical Current Densities","authors":["N.H. Babu","T.D. Withnell","K. Iida","D.A. Cardwell"],"year":"2007","journal":"IEEE Transactions on Applied Superconductivity","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"01072d300e511ca28ca45dd52c9b6b80c81d0cbd"}
+{"doi":"10.1109/tcad.2002.1013896","title":"DS-LFSR: a BIST TPG for low switching activity","authors":["S. Wang","S.K. Gupta"],"year":"2002","journal":"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"168b3167d58cbc2a9311d36a2013cac643990819"}
+{"doi":"10.1109/tcsii.2006.882206","title":"Leakage Biased pMOS Sleep Switch Dynamic Circuits","authors":["Z. Liu","V. Kursun"],"year":"2006","journal":"IEEE Transactions on Circuits and Systems II: Express Briefs","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"1a2c05a969beb61f32e5824d9fd916c68dc50ea8"}
+{"doi":"10.1109/ted.2004.839740","title":"The DNA SET: a novel device for single-molecule DNA sequencing","authors":["P. Mali","R.K. Lal"],"year":"2004","journal":"IEEE Transactions on Electron Devices","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"9859deb2b021db124f8c61ec3cb7b1ab52bfdb4d"}
+{"doi":"10.1109/ted.2006.887198","title":"Performance and Reliability of Strained-Silicon nMOSFETs With SiN Cap Layer","authors":["Gino Giusi","Felice Crupi","Eddy Simoen","Geert Eneman","Malgorzata Jurczak"],"year":"2007","journal":"IEEE Transactions on Electron Devices","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"0cdec1bf14c99dd1f782c382150f4659f1524922"}
+{"doi":"10.1109/test.2004.1386949","title":"Interconnect test pattern generation algorithm for meeting device and global SSO limits with safe initial vectors","authors":["K. Baker","M. Nourani"],"year":"null","journal":"2004 International Conferce on Test","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"02e6fba4eed66f04a92930a9a57fb2687e1a237b"}
+{"doi":"10.1109/tit.1984.1056869","title":"An algorithm for maximizing expected log investment return","authors":["T. Cover"],"year":"1984","journal":"IEEE Transactions on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Library and Information Sciences","type":"journal-article","sha":"f51017714534dc649c8b53c90c80aef4a3481d32"}
+{"doi":"10.1109/tit.2005.847751","title":"Nonlinear Analysis of the Iterative Decoding of Parallel Concatenated Convolutional Codes","authors":["F. Lehmann","G.M. Maggio"],"year":"2005","journal":"IEEE Transactions on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Library and Information Sciences","type":"journal-article","sha":"7dd8e2ca8f4f537c9b3dd6288e8ea694b18d2098"}
+{"doi":"10.1109/tit.2005.851744","title":"Insufficiency of Linear Coding in Network Information Flow","authors":["R. Dougherty","C. Freiling","K. Zeger"],"year":"2005","journal":"IEEE Transactions on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Library and Information Sciences","type":"journal-article","sha":"2c80ca4786e43d4af0516e43662187c4652c048b"}
+{"doi":"10.1109/tit.2006.874534","title":"Separating distributed source coding from network coding","authors":["A. Ramamoorthy","K. Jain","P.A. Chou","M. Effros"],"year":"2006","journal":"IEEE Transactions on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Library and Information Sciences","type":"journal-article","sha":"c84dff7d1b336173b2c320ee7f592c59c9a3413a"}
+{"doi":"10.1109/tit.2008.2011427","title":"Self-Organization Properties of CSMA/CA Systems and Their Consequences on Fairness","authors":["Mathilde Durvy","Olivier Dousse","Patrick Thiran"],"year":"2009","journal":"IEEE Transactions on Information Theory","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Library and Information Sciences","type":"journal-article","sha":"1198ffbc0ff68352ba52f37623f312f745562822"}
+{"doi":"10.1109/tmag.2003.815747","title":"Micromagnetic simulation of the pinning and depinning process in permanent magnets","authors":["W. Scholz","T. Schrefl","J. Fidler","T. Matthias","D. Suess","V. Tsiantos"],"year":"2003","journal":"IEEE Transactions on Magnetics","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"e566c063e98077f36efc2e0be7c6c3f9b57e87d0"}
+{"doi":"10.1109/tnn.2002.1031947","title":"Web mining in soft computing framework: relevance, state of the art and future directions","authors":["S.K. Pal","V. Talwar","P. Mitra"],"year":"2002","journal":"IEEE Transactions on Neural Networks","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Networks and Communications","type":"journal-article","sha":"9edaf8342f509de3013964b58fdecb6a3163db8a"}
+{"doi":"10.1109/tpami.2008.46","title":"Full-Search-Equivalent Pattern Matching with Incremental Dissimilarity Approximations","authors":["F. Tombari","S. Mattoccia","L. Di Stefano"],"year":"2009","journal":"IEEE Transactions on Pattern Analysis and Machine Intelligence","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computational Theory and Mathematics","type":"journal-article","sha":"92488ad800943b4e0488331e468120b3e41baa6d"}
+{"doi":"10.1109/tpami.2014.2346173","title":"Dense Subgraph Partition of Positive Hypergraphs","authors":["Hairong Liu","Longin Jan Latecki","Shuicheng Yan"],"year":"2015","journal":"IEEE Transactions on Pattern Analysis and Machine Intelligence","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computational Theory and Mathematics","type":"journal-article","sha":"cfe5f4fe4bd428aaed47cba12ef8fbd8617692b9"}
+{"doi":"10.1109/tpds.2011.171","title":"Optimally Maximizing Iteration-Level Loop Parallelism","authors":["Duo Liu","Yi Wang","Zili Shao","Minyi Guo","Jingling Xue"],"year":"2012","journal":"IEEE Transactions on Parallel and Distributed Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"a1964dbe96c70a3c7b47a508eea4bd8121df8915"}
+{"doi":"10.1109/tpwrs.2003.811197","title":"Optimal energy management of an industrial consumer in liberalized markets","authors":["E. Gomez-Villalva","A. Ramos"],"year":"2003","journal":"IEEE Transactions on Power Systems","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"76842dbdd830902efb365b4e8afb6b2fee8323b6"}
+{"doi":"10.1109/tsc.2009.4","title":"A Reference Architecture for Scientific Workflow Management Systems and the VIEW SOA Solution","authors":[" Cui Lin"," Shiyong Lu"," Xubo Fei","A. Chebotko"," Darshan Pai"," Zhaoqiang Lai","F. Fotouhi"," Jing Hua"],"year":"2009","journal":"IEEE Transactions on Services Computing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Computer Networks and Communications","type":"journal-article","sha":"ce60c5ee80f26ea0d18a292d9ebaf7b4bc27b152"}
+{"doi":"10.1109/tsp.2005.850343","title":"Robust iterative fitting of multilinear models","authors":["S.A. Vorobyov"," Yue Rong","N.D. Sidiropoulos","A.B. Gershman"],"year":"2005","journal":"IEEE Transactions on Signal Processing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Signal Processing","type":"journal-article","sha":"72d83fd37a0fadea9b3b6d5dc8475f7e3516ef09"}
+{"doi":"10.1109/twc.2013.013013.121689","title":"Heterogeneous Relay Selection","authors":["Mohamed Abouelseoud","Aria Nosratinia"],"year":"2013","journal":"IEEE Transactions on Wireless Communications","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"7dd2e1694977fb0002610eb51c9660ca082e7e94"}
+{"doi":"10.1109/vast.2010.5652460","title":"Flow-based scatterplots for sensitivity analysis","authors":["Yu-Hsuan Chan","Carlos D. Correa","Kwan-Liu Ma"],"year":"2010","journal":"2010 IEEE Symposium on Visual Analytics Science and Technology","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"40a48c89bf9c5b672b335a48a3276a491804cf26"}
+{"doi":"10.1109/vlhcc.2004.8","title":"An Interface Design Methodology: Scenario Based Design Extended for Diverse Computer User Groups","authors":["K.T. Johnson"],"year":"null","journal":"2004 IEEE Symposium on Visual Languages - Human Centric Computing","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"e346940f6515e60f36fc7d08de757300bbccad61"}
+{"doi":"10.1109/wcnc.2011.5779357","title":"Non-regenerative full distributed space-time codes in cooperative relaying networks","authors":["Le Quang Vinh Tran","Olivier Berder","Olivier Sentieys"],"year":"2011","journal":"2011 IEEE Wireless Communications and Networking Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"74cba9ac1bd67ce33459ee1c9bd9303db182adac"}
+{"doi":"10.1109/wsc.2008.4736403","title":"A simulation model to analyze the impact of hole size on putting in golf","authors":["Matulya Bansal","Mark Broadie"],"year":"2008","journal":"2008 Winter Simulation Conference","publisher":"Institute of Electrical and Electronics Engineers (IEEE)","subject":"","type":"proceedings-article","sha":"9071b70f43757a38d626c560da99eb23229ac266"}
+{"doi":"10.1111/1467-9892.00271","title":"Bayesian analysis of switching ARCH models","authors":["SYLVIA KAUFMANN","SYLVIA FRUHWIRTH-SCHNATTER"],"year":"2002","journal":"Journal of Time Series Analysis","publisher":"Wiley-Blackwell","subject":"Statistics, Probability and Uncertainty","type":"journal-article","sha":"a055651331084dd2688846279a1075a12ef39f79"}
+{"doi":"10.1111/1468-232x.00299","title":"State Prevailing Wage Laws and School Construction Costs","authors":["Hamid Azari-Rad","Peter Philips","Mark J. Prus"],"year":"2003","journal":"Industrial Relations","publisher":"Wiley-Blackwell","subject":"Management of Technology and Innovation","type":"journal-article","sha":"232260eeb59aaecbf045a45fcf6b50aba450ee1e"}
+{"doi":"10.1111/gcb.12802","title":"How inhibiting nitrification affects nitrogen cycle and reduces environmental impacts of anthropogenic nitrogen input","authors":["Chunlian Qiao","Lingli Liu","Shuijin Hu","Jana E. Compton","Tara L. Greaver","Quanlin Li"],"year":"2015","journal":"Global Change Biology","publisher":"Wiley-Blackwell","subject":"Ecology","type":"journal-article","sha":"df571d45f4bf2c1ac9d557e27bcdc454d53eff0d"}
+{"doi":"10.1111/iere.12092","title":"COLLATERAL REQUIREMENTS AND ASSET PRICES","authors":["Johannes Brumm","Michael Grill","Felix Kubler","Karl Schmedders"],"year":"2015","journal":"International Economic Review","publisher":"Wiley-Blackwell","subject":"Economics and Econometrics","type":"journal-article","sha":"4fa3a9c254c962d46f87acd7638f5ce3496451f7"}
+{"doi":"10.1111/j.0966-8373.2004.00200.x","title":"Wittgenstein on the Substance of the World","authors":["Ian Proops"],"year":"2004","journal":"European Journal of Philosophy","publisher":"Wiley-Blackwell","subject":"Philosophy","type":"journal-article","sha":"57d0b602653ba779e270abb900a1de312736a80c"}
+{"doi":"10.1111/j.1151-2916.1995.tb08922.x","title":"Thermal Expansion and Glass Transition Temperatures of Y-Mg-Si-Al-O-N Glasses","authors":["Irene M. Peterson","Tseng-Ying Tien"],"year":"1995","journal":"Journal of the American Ceramic Society","publisher":"Wiley-Blackwell","subject":"Materials Chemistry","type":"journal-article","sha":"b8a004e0225186c2c0d236f062b42948f7aa142a"}
+{"doi":"10.1111/j.1365-2605.2005.00619.x","title":"Neuroendocrine mechanisms controlling female puberty: new approaches, new concepts","authors":["Sergio R. Ojeda","Christian Roth","Alison Mungenast","Sabine Heger","Claudio Mastronardi","Anne-Simone Parent","Alejandro Lomniczi","Heike Jung"],"year":"2006","journal":"International Journal of Andrology","publisher":"Wiley-Blackwell","subject":"Urology","type":"journal-article","sha":"b1f689952cb2f4d02a6a5dbdefd7180533333e67"}
+{"doi":"10.1111/j.1365-2818.2012.03636.x","title":"Analysis of spatial structure of epidermal nerve entry point patterns based on replicated data","authors":["M. MYLLYMÄKI","I.G PANOUTSOPOULOU","A. SÄRKKÄ"],"year":"2012","journal":"Journal of Microscopy","publisher":"Wiley-Blackwell","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"fd3e8eb8f3ff38919388336f155a2de4696bbd92"}
+{"doi":"10.1111/j.1365-2834.2007.00670.x","title":"The Nursing Worklife Model: Extending and Refining a New Theory","authors":["MILISA MANOJLOVICH","HEATHER LASCHINGER"],"year":"2007","journal":"Journal of Nursing Management","publisher":"Wiley-Blackwell","subject":"Leadership and Management","type":"journal-article","sha":"906f6f19db9ae0e0d4ad9f7b52723c0715d0f82b"}
+{"doi":"10.1111/j.1365-2907.2008.00133.x","title":"Ecology and conservation of common bottlenose dolphinsTursiops truncatusin the Mediterranean Sea","authors":["GIOVANNI BEARZI","CATERINA MARIA FORTUNA","RANDALL R. REEVES"],"year":"2009","journal":"Mammal Review","publisher":"Wiley-Blackwell","subject":"Animal Science and Zoology","type":"journal-article","sha":"f2bd6d365ad7a1cf5b1b4f93ecd29517ff684fb2"}
+{"doi":"10.1111/j.1365-313x.2008.03725.x","title":"The preprophase band is a localized center of clathrin-mediated endocytosis in late prophase cells of the onion cotyledon epidermis","authors":["Ichirou Karahara","Jinsuke Suda","Hiroshi Tahara","Etsuo Yokota","Teruo Shimmen","Kazuyo Misaki","Shigenobu Yonemura","Lucas Andrew Staehelin","Yoshinobu Mineyuki"],"year":"2009","journal":"The Plant Journal","publisher":"Wiley-Blackwell","subject":"Plant Science","type":"journal-article","sha":"d038ae96b106a7d7d34e636df4a75b2df88f1095"}
+{"doi":"10.1111/j.1467-9256.2004.00199.x","title":"Politics and Power in the European Convention","authors":["Ben Crum"],"year":"2004","journal":"Politics","publisher":"SAGE Publications","subject":"Political Science and International Relations","type":"journal-article","sha":"db881b623dfe5aba8d119953f126150c08df607c"}
+{"doi":"10.1111/j.1468-0297.2009.02221.x","title":"Regulation in Happyville","authors":["François Salanié","Nicolas Treich"],"year":"2009","journal":"The Economic Journal","publisher":"Wiley-Blackwell","subject":"Economics and Econometrics","type":"journal-article","sha":"6e4f269f091b41b23f5a421a367257ace7b74208"}
+{"doi":"10.1111/j.1474-9726.2009.00459.x","title":"Different dietary restriction regimens extend lifespan by both independent and overlapping genetic pathways inC. elegans","authors":["Eric L. Greer","Anne Brunet"],"year":"2009","journal":"Aging Cell","publisher":"Wiley-Blackwell","subject":"Cell Biology","type":"journal-article","sha":"d8dc02b95fcb8e12d38a444d06aa42cd9204ec93"}
+{"doi":"10.1111/j.1475-679x.2008.00295.x","title":"Keynesian Beauty Contest, Accounting Disclosure, and Market Efficiency","authors":["PINGYANG GAO"],"year":"2008","journal":"Journal of Accounting Research","publisher":"Wiley-Blackwell","subject":"Economics and Econometrics","type":"journal-article","sha":"86efc1aab363f3f24ae8ca2c796d1d6340faad43"}
+{"doi":"10.1111/j.1523-1739.2006.00346.x","title":"Impacts of Alternative Timber Harvest Practices on Leaf-Chewing Herbivores of Oak","authors":["REBECCA E. FORKNER","ROBERT J. MARQUIS","JOHN T. LILL","JOSIANE LE CORFF"],"year":"2006","journal":"Conservation Biology","publisher":"Wiley-Blackwell","subject":"Ecology","type":"journal-article","sha":"59f9d6226b0762d27ef82c3767622b1febf3b910"}
+{"doi":"10.1111/j.1523-1739.2008.01084.x","title":"Combining the Species-Area-Habitat Relationship and Environmental Cluster Analysis to Set Conservation Priorities: a Study in the Zhoushan Archipelago, China","authors":["YOU-HUA CHEN"],"year":"2009","journal":"Conservation Biology","publisher":"Wiley-Blackwell","subject":"Ecology","type":"journal-article","sha":"d2d06507bd3db01b3c42e16cddc203b86518a138"}
+{"doi":"10.1111/j.1542-4774.2012.01075.x","title":"ROUTES OF INFECTION: EXPORTS AND HIV INCIDENCE IN SUB-SAHARAN AFRICA","authors":["Emily Oster"],"year":"2012","journal":"Journal of the European Economic Association","publisher":"Oxford University Press (OUP)","subject":"","type":"journal-article","sha":"479fd1c2a3e5b5a8c9db77441a73203fc6a5ee0e"}
+{"doi":"10.1111/j.1553-2712.2010.00907.x","title":"Palliative Care Needs of Seriously Ill, Older Adults Presenting to the Emergency Department","authors":["Corita R. Grudzen","Lynne D. Richardson","Matthew Morrison","Elizabeth Cho","R. Sean Morrison"],"year":"2010","journal":"Academic Emergency Medicine","publisher":"Wiley-Blackwell","subject":"Emergency Medicine","type":"journal-article","sha":"0516faa504116b73387c35e659d45e521066052b"}
+{"doi":"10.1111/j.1744-7348.1991.tb04848.x","title":"The use of non-destructive measurement and physiological models of yield determination to investigate factors determining differences in seed yield between genotypes of \"desi\" chickpeas (Cicer arietum)†","authors":["J. H. WILLIAMS","N. P. SAXENA"],"year":"1991","journal":"Annals of Applied Biology","publisher":"Wiley-Blackwell","subject":"Agronomy and Crop Science","type":"journal-article","sha":"f824598d76be2a195a29008252d1df198bfb5cdb"}
+{"doi":"10.1111/j.1745-4565.2010.00219.x","title":"INFLUENCE OF INTESTINAL SURFACTANT LIKE PARTICLES ON DIFFERENTIAL ACTIVATION OF SECONDARY SIGNALING MOLECULES DURING SALMONELLA TYPHIMURIUM INFECTION","authors":["M. HANIEF SOFI","ARCHANA BHATNAGAR","SAVEETA SAPRA","AKHTAR MAHMOOD","SIDHARTHA MAJUMDAR"],"year":"2010","journal":"Journal of Food Safety","publisher":"Wiley-Blackwell","subject":"Food Science","type":"journal-article","sha":"4d1e99b62b85d3767d369797d40efdff5e2ccf36"}
+{"doi":"10.1111/j.1745-6622.2003.tb00531.x","title":"RECOGNIZING MACROECONOMIC FLUCTUATIONS IN VALUE BASED MANAGEMENT","authors":["Lars Oxelheim","Clas Wihlborg"],"year":"2003","journal":"Journal of Applied Corporate Finance","publisher":"Wiley-Blackwell","subject":"","type":"journal-article","sha":"18b9ff2814b0a0c9d79efa57b171c66dc85870c0"}
+{"doi":"10.1111/j.1747-9991.2007.00090.x","title":"External World Skepticism","authors":["John Greco"],"year":"2007","journal":"Philosophy Compass","publisher":"Wiley-Blackwell","subject":"","type":"journal-article","sha":"59e77ae369baa052f602fb5ab9f23118f85142b4"}
+{"doi":"10.1111/jasp.12184","title":"Interactive effects of proactive personality and display rules on emotional labor in organizations","authors":["Kristen L. Randolph","Jason J. Dahling"],"year":"2013","journal":"Journal of Applied Social Psychology","publisher":"Wiley-Blackwell","subject":"Social Psychology","type":"journal-article","sha":"a72df616d77469f0c13ee738622ddf71ed77902e"}
+{"doi":"10.1111/poms.12160","title":"Managing Disruptions in Decentralized Supply Chains with Endogenous Supply Process Reliability","authors":["Sammi Y. Tang","Haresh Gurnani","Diwakar Gupta"],"year":"2014","journal":"Production and Operations Management","publisher":"Wiley-Blackwell","subject":"Management of Technology and Innovation","type":"journal-article","sha":"461ec751d6168f63555b157575f0912411685ee4"}
+{"doi":"10.11113/jt.v36.562","title":"Development of a Two–Stage Biomass Combustion System for Reducing The Emission Pollutant","authors":["Adi Surjosatyo","Farid Nasir Ani"],"year":"null","journal":"Jurnal Teknologi","publisher":"Penerbit UTM Press","subject":"","type":"journal-article","sha":"70d373cc1c4ef23906cce37ce290536d79353f69"}
+{"doi":"10.1116/1.3056172","title":"Imaging capability of pseudomorphic high electron mobility transistors, AlGaN∕GaN, and Si micro-Hall probes for scanning Hall probe microscopy between 25 and 125 °C","authors":["R. Akram","M. Dede","A. Oral"],"year":"2009","journal":"Journal of Vacuum Science & Technology B: Microelectronics and Nanometer Structures","publisher":"American Vacuum Society","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"f5751f540db2c99c435b200bca65bd8f167c695c"}
+{"doi":"10.1117/12.2076473","title":"Emotion-prints: interaction-driven emotion visualization on multi-touch interfaces","authors":["Daniel Cernea","Christopher Weber","Achim Ebert","Andreas Kerren"],"year":"2015","journal":"Visualization and Data Analysis 2015","publisher":"SPIE-Intl Soc Optical Eng","subject":"","type":"proceedings-article","sha":"1947f21d1f79168028210b9039832fde6987d963"}
+{"doi":"10.1117/12.317362","title":"<title>Heterodyne instrumentation at the CSO</title>","authors":["Jacob W. Kooi","P. L. Schaffer","Bruce Bumble","Rick LeDuc","Thomas G. Phillips"],"year":"1998","journal":"Advanced Technology MMW, Radio, and Terahertz Telescopes","publisher":"SPIE-Intl Soc Optical Eng","subject":"","type":"proceedings-article","sha":"ba795eb58b42e7807e1be993cb3bc02ccdfbb672"}
+{"doi":"10.1117/12.538328","title":"<title>PackBot: a versatile platform for military robotics</title>","authors":["Brian M. Yamauchi"],"year":"2004","journal":"Unmanned Ground Vehicle Technology VI","publisher":"SPIE-Intl Soc Optical Eng","subject":"","type":"proceedings-article","sha":"34a748a0d60a6ff9fab3d0edbee1cd1b8621a5b8"}
+{"doi":"10.1117/12.851939","title":"Receiver sensitivity improvement for NRZ-OOK signal by optical parametric amplifier-assisted detection","authors":["Yu Liang","P.C. Chui","K. K. Y. Wong"],"year":"2009","journal":"Passive Components and Fiber-based Devices VI","publisher":"SPIE-Intl Soc Optical Eng","subject":"","type":"proceedings-article","sha":"be538ea0b14529001bf2d46a0dc9fd843e85b0d8"}
+{"doi":"10.1117/12.857289","title":"Introducing high performance distributed logging service for ACS","authors":["Jorge A. Avarias","Joao S. López","Cristián Maureira","Heiko Sommer","Gianluca Chiozzi"],"year":"2010","journal":"Software and Cyberinfrastructure for Astronomy","publisher":"SPIE-Intl Soc Optical Eng","subject":"","type":"proceedings-article","sha":"96144660134808afbc2e48b5361e2941bd32a8b6"}
+{"doi":"10.1126/science.1074085","title":"Sustaining Fisheries Yields Over Evolutionary Time Scales","authors":["D. O. Conover"],"year":"2002","journal":"Science","publisher":"American Association for the Advancement of Science (AAAS)","subject":"General","type":"journal-article","sha":"9168b940960b29e950787ec33b8e93cc9f67d9c3"}
+{"doi":"10.1126/science.1104346","title":"Genetic Factors in Type 2 Diabetes: The End of the Beginning?","authors":["S. O'Rahilly"],"year":"2005","journal":"Science","publisher":"American Association for the Advancement of Science (AAAS)","subject":"General","type":"journal-article","sha":"7b6954e63235748f3c58a64b777016e5d72baeb2"}
+{"doi":"10.1126/science.1132362","title":"THE EARLY YEARS: Evaluating Montessori Education","authors":["A. Lillard"],"year":"2006","journal":"Science","publisher":"American Association for the Advancement of Science (AAAS)","subject":"General","type":"journal-article","sha":"9a2cbe121508e2013c42a3d2b9878d0e05173b4b"}
+{"doi":"10.1127/0935-1221/2007/0019-1763","title":"Geochemistry of mafic phenocrysts from alkaline lamprophyres of the Spanish Central System: implications on crystal fractionation, magma mixing and xenoliths entrapment within deep magma chambers","authors":["David Orejana","Carlos Villaseca","Bruce A. Paterson"],"year":"2007","journal":"European Journal of Mineralogy","publisher":"Schweizerbart","subject":"Geochemistry and Petrology","type":"journal-article","sha":"25113383c54bcf6c14d1bb4cc4e4b79deb04e825"}
+{"doi":"10.1127/0935-1221/2012/0024-2203","title":"Li-bearing tourmalines in Variscan granitic pegmatites from the Moldanubian nappes, Lower Austria","authors":["Andreas Ertl","Ralf Schuster","John M. Hughes","Thomas Ludwig","Hans-Peter Meyer","Friedrich Finger","M. Darby Dyar","Katja Ruschel","George R. Rossman","Urs Klötzli","Franz Brandstätter","Christian L. Lengauer","Ekkehart Tillmanns"],"year":"2012","journal":"European Journal of Mineralogy","publisher":"Schweizerbart","subject":"Geochemistry and Petrology","type":"journal-article","sha":"0ea9a3afc9e2507ef2eba02a6efe4a69ec9bf03a"}
+{"doi":"10.1130/rf.l006.1","title":"Crustal melting, ductile flow, and deformation in mountain belts: Cause and effect relationships","authors":["M. Searle"],"year":"2013","journal":"Lithosphere","publisher":"Geological Society of America","subject":"Geology","type":"journal-article","sha":"0bffc30eacbe575a9231e507b6e4bd33a3040f69"}
+{"doi":"10.1134/s1990478909010013","title":"Planar graph classes with the independent set problem solvable in polynomial time","authors":["V. E. Alekseev","D. S. Malyshev"],"year":"2009","journal":"Journal of Applied and Industrial Mathematics","publisher":"Pleiades Publishing Ltd","subject":"Industrial and Manufacturing Engineering","type":"journal-article","sha":"f789c4ddf84493555f6c95f1842105be104c2d25"}
+{"doi":"10.1136/amiajnl-2012-000847","title":"Self-reported fever and measured temperature in emergency department records used for syndromic surveillance: Table 1","authors":["Taha A Kass-Hout","David Buckeridge","John Brownstein","Zhiheng Xu","Paul McMurray","Charles K T Ishikawa","Julia Gunn","Barbara L Massoudi"],"year":"2012","journal":"Journal of the American Medical Informatics Association","publisher":"Oxford University Press (OUP)","subject":"Health Informatics","type":"journal-article","sha":"f906abbcabba0690e3649209125b05b13a33e6c7"}
+{"doi":"10.1136/ard.2004.031807","title":"The development of the EULAR-OMERACT rheumatoid arthritis MRI reference image atlas","authors":["P Bird"],"year":"2005","journal":"Annals of the Rheumatic Diseases","publisher":"BMJ","subject":"Immunology","type":"journal-article","sha":"d69351a9073651ece8c9ace1ba3b91cee9b5f2e3"}
+{"doi":"10.1136/bcr.09.2008.0898","title":"Primary leiomyoma of the liver: accurate preoperative diagnosis on liver biopsy","authors":["H. T Sousa","F. Portela","L. Semedo","E. Furtado","C. Marinho","M. A Cipriano","M. C Leitao"],"year":"2009","journal":"Case Reports","publisher":"BMJ","subject":"","type":"journal-article","sha":"a0e67e8f9052b62091f1e53cb63d777be3215d18"}
+{"doi":"10.1136/heartjnl-2012-302920a.71","title":"ADIPOSE DERIVED MESENCHYMAL STEM CELLS ENHANCE CARDIAC FUNCTION AFTER MYOCARDIAC INFARCTION VIA PARACRINE EFFECT","authors":["Wei Wang","Chunyu Zeng"],"year":"2012","journal":"Heart","publisher":"BMJ","subject":"Cardiology and Cardiovascular Medicine","type":"journal-article","sha":"0734aaa781354de047c576a2ecdeb9e21c0f8238"}
+{"doi":"10.1136/jme.2006.019984","title":"The expressivist objection to prenatal diagnosis: can it be laid to rest?","authors":["S Holm"],"year":"2008","journal":"Journal of Medical Ethics","publisher":"BMJ","subject":"Health Policy","type":"journal-article","sha":"a6b6f9f213695d4ed741c2aa4b2b3de4d88353be"}
+{"doi":"10.1136/jme.9.4.207","title":"Ethical aspects of clinical chemistry.","authors":["E BenGershom"],"year":"1983","journal":"Journal of Medical Ethics","publisher":"BMJ","subject":"Health Policy","type":"journal-article","sha":"93a297d7f9aeb9280d6137b71a34168b8b48623d"}
+{"doi":"10.1136/jnnp.49.7.824","title":"Auditory dysfunction in Ramsay Hunt syndrome.","authors":["V J Iragui"],"year":"1986","journal":"Journal of Neurology, Neurosurgery & Psychiatry","publisher":"BMJ","subject":"Medicine(all)","type":"journal-article","sha":"bca40e5ac1cb9c42f462295a831e32c052581168"}
+{"doi":"10.1136/thorax.56.4.266","title":"Relationship between anxiety, depression, and morbidity in adult asthma patients","authors":["L D Rimington"],"year":"2001","journal":"Thorax","publisher":"BMJ","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"fdd7b113f15601df6d5ee228be864daf494ada6d"}
+{"doi":"10.1136/thoraxjnl-2013-204974","title":"Cochrane corner: is integrated disease management for patients with COPD effective?: Table 1","authors":["Annemarije L Kruis","Nynke Smidt","Willem J J Assendelft","Jacobijn Gussekloo","Melinde R S Boland","Maureen Rutten-van Mölken","Niels H Chavannes"],"year":"2014","journal":"Thorax","publisher":"BMJ","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"8f6e0cbc715d86cad1698e18df5e4aa2ac2022c3"}
+{"doi":"10.1137/040607587","title":"Deriving Information about Architecture from Activity Patterns in Coupled Cell Systems","authors":["Kresimir Josic","Jonathan Rubin"],"year":"2005","journal":"SIAM Journal on Applied Dynamical Systems","publisher":"Society for Industrial & Applied Mathematics (SIAM)","subject":"Modelling and Simulation","type":"journal-article","sha":"eca4739f083eb4efcbd90295d752532a16b92346"}
+{"doi":"10.1137/040611100","title":"A Priori Error Estimates for the Finite Element Discretization of Elliptic Parameter Identification Problems with Pointwise Measurements","authors":["R. Rannacher","B. Vexler"],"year":"2005","journal":"SIAM Journal on Control and Optimization","publisher":"Society for Industrial & Applied Mathematics (SIAM)","subject":"Control and Optimization","type":"journal-article","sha":"d3b3aa88fcba0925645d8fc9ff998b038ce52e5c"}
+{"doi":"10.1137/1.9781611972825.91","title":"Regularized Structured Output Learning with Partial Labels","authors":["Sundararajan Sellamanickam","Charu Tiwari","Sathiya Keerthi Selvaraj"],"year":"2012","journal":"Proceedings of the 2012 SIAM International Conference on Data Mining","publisher":"Society for Industrial & Applied Mathematics (SIAM)","subject":"","type":"book-chapter","sha":"8771f2298a70d7939e47bd195eb34b3bcd9b4e3b"}
+{"doi":"10.1137/110836602","title":"An Optimal Algorithm for Constrained Differentiable Convex Optimization","authors":["Clóvis C. Gonzaga","Elizabeth W. Karas","Diane R. Rossetto"],"year":"2013","journal":"SIAM Journal on Optimization","publisher":"Society for Industrial & Applied Mathematics (SIAM)","subject":"Theoretical Computer Science","type":"journal-article","sha":"712ed5a20dfa8e63c47d0296d5766055e76026da"}
+{"doi":"10.1137/s0895479893252623","title":"On the Stability of Cholesky Factorization for Symmetric Quasidefinite Systems","authors":["Philip E. Gill","Michael A. Saunders","Joseph R. Shinnerl"],"year":"1996","journal":"SIAM Journal on Matrix Analysis and Applications","publisher":"Society for Industrial & Applied Mathematics (SIAM)","subject":"Analysis","type":"journal-article","sha":"2a4e5e873553f52aa337a5c5228b409b7d061028"}
+{"doi":"10.1139/f00-215","title":"Allometry of natural mortality as a basis for assessing optimal release size in fish-stocking programmes","authors":["Kai Lorenzen"],"year":"2000","journal":"Canadian Journal of Fisheries and Aquatic Sciences","publisher":"Canadian Science Publishing","subject":"Aquatic Science","type":"journal-article","sha":"92ec6a2aa0581d32ac90c4184e4b94b221ec5a68"}
+{"doi":"10.1140/epjc/s10052-012-2241-5","title":"ATLAS search for a heavy gauge boson decaying to a charged lepton and a neutrino in pp collisions at $\\\\sqrt{s} = 7\\\\ \\\\mathrm{TeV}$","authors":[" The ATLAS Collaboration","G. Aad","T. Abajyan","B. Abbott","J. Abdallah","S. Abdel Khalek","A. A. Abdelalim","O. Abdinov","R. Aben","B. Abi","M. Abolins","O. S. AbouZeid","H. Abramowicz","H. Abreu","B. S. Acharya","L. Adamczyk","D. L. Adams","T. N. Addy","J. Adelman","S. Adomeit","P. Adragna","T. Adye","S. Aefsky","J. A. Aguilar-Saavedra","M. Agustoni","M. Aharrouche","S. P. Ahlen","F. Ahles","A. Ahmad","M. Ahsan","G. Aielli","T. Akdogan","T. P. A. Ã…kesson","G. Akimoto","A. V. Akimov","M. S. Alam","M. A. Alam","J. Albert","S. Albrand","M. Aleksa","I. N. Aleksandrov","F. Alessandria","C. Alexa","G. Alexander","G. Alexandre","T. Alexopoulos","M. Alhroob","M. Aliev","G. Alimonti","J. Alison","B. M. M. Allbrooke","P. P. Allport","S. E. Allwood-Spiers","J. Almond","A. Aloisio","R. Alon","A. Alonso","F. Alonso","A. Altheimer","B. Alvarez Gonzalez","M. G. Alviggi","K. Amako","C. Amelung","V. V. Ammosov","S. P. Amor Dos Santos","A. Amorim","N. Amram","C. Anastopoulos","L. S. Ancu","N. Andari","T. Andeen","C. F. Anders","G. Anders","K. J. Anderson","A. Andreazza","V. Andrei","M-L. Andrieux","X. S. Anduaga","P. Anger","A. Angerami","F. Anghinolfi","A. Anisenkov","N. Anjos","A. Annovi","A. Antonaki","M. Antonelli","A. Antonov","J. Antos","F. Anulli","M. Aoki","S. Aoun","L. Aperio Bella","R. Apolle","G. Arabidze","I. Aracena","Y. Arai","A. T. H. Arce","S. Arfaoui","J-F. Arguin","E. Arik","M. Arik","A. J. Armbruster","O. Arnaez","V. Arnal","C. Arnault","A. Artamonov","G. Artoni","D. Arutinov","S. Asai","R. Asfandiyarov","S. Ask","B. Ã…sman","L. Asquith","K. Assamagan","A. Astbury","M. Atkinson","B. Aubert","E. Auge","K. Augsten","M. Aurousseau","G. Avolio","R. Avramidou","D. Axen","G. Azuelos","Y. Azuma","M. A. Baak","G. Baccaglioni","C. Bacci","A. M. Bach","H. Bachacou","K. 
Bachas","M. Backes","M. Backhaus","E. Badescu","P. Bagnaia","S. Bahinipati","Y. Bai","D. C. Bailey","T. Bain","J. T. Baines","O. K. Baker","M. D. Baker","S. Baker","E. Banas","P. Banerjee","Sw. Banerjee","D. Banfi","A. Bangert","V. Bansal","H. S. Bansil","L. Barak","S. P. Baranov","A. Barbaro Galtieri","T. Barber","E. L. Barberio","D. Barberis","M. Barbero","D. Y. Bardin","T. Barillari","M. Barisonzi","T. Barklow","N. Barlow","B. M. Barnett","R. M. Barnett","A. Baroncelli","G. Barone","A. J. Barr","F. Barreiro","J. Barreiro Guimarães da Costa","P. Barrillon","R. Bartoldus","A. E. Barton","V. Bartsch","A. Basye","R. L. Bates","L. Batkova","J. R. Batley","A. Battaglia","M. Battistin","F. Bauer","H. S. Bawa","S. Beale","T. Beau","P. H. Beauchemin","R. Beccherle","P. Bechtle","H. P. Beck","A. K. Becker","S. Becker","M. Beckingham","K. H. Becks","A. J. Beddall","A. Beddall","S. Bedikian","V. A. Bednyakov","C. P. Bee","L. J. Beemster","M. Begel","S. Behar Harpaz","P. K. Behera","M. Beimforde","C. Belanger-Champagne","P. J. Bell","W. H. Bell","G. Bella","L. Bellagamba","F. Bellina","M. Bellomo","A. Belloni","O. Beloborodova","K. Belotskiy","O. Beltramello","O. Benary","D. Benchekroun","K. Bendtz","N. Benekos","Y. Benhammou","E. Benhar Noccioli","J. A. Benitez Garcia","D. P. Benjamin","M. Benoit","J. R. Bensinger","K. Benslama","S. Bentvelsen","D. Berge","E. Bergeaas Kuutmann","N. Berger","F. Berghaus","E. Berglund","J. Beringer","P. Bernat","R. Bernhard","C. Bernius","T. Berry","C. Bertella","A. Bertin","F. Bertolucci","M. I. Besana","G. J. Besjes","N. Besson","S. Bethke","W. Bhimji","R. M. Bianchi","M. Bianco","O. Biebel","S. P. Bieniek","K. Bierwagen","J. Biesiada","M. Biglietti","H. Bilokon","M. Bindi","S. Binet","A. Bingul","C. Bini","C. Biscarat","B. Bittner","K. M. Black","R. E. Blair","J.-B. Blanchard","G. Blanchot","T. Blazek","I. Bloch","C. Blocker","J. Blocki","A. Blondel","W. Blum","U. Blumenschein","G. J. Bobbink","V. B. Bobrovnikov","S. S. Bocchetta","A. 
Bocci","C. R. Boddy","M. Boehler","J. Boek","N. Boelaert","J. A. Bogaerts","A. Bogdanchikov","A. Bogouch","C. Bohm","J. Bohm","V. Boisvert","T. Bold","V. Boldea","N. M. Bolnet","M. Bomben","M. Bona","M. Boonekamp","S. Bordoni","C. Borer","A. Borisov","G. Borissov","I. Borjanovic","M. Borri","S. Borroni","V. Bortolotto","K. Bos","D. Boscherini","M. Bosman","H. Boterenbrood","J. Bouchami","J. Boudreau","E. V. Bouhova-Thacker","D. Boumediene","C. Bourdarios","N. Bousson","A. Boveia","J. Boyd","I. R. Boyko","I. Bozovic-Jelisavcic","J. Bracinik","P. Branchini","G. W. Brandenburg","A. Brandt","G. Brandt","O. Brandt","U. Bratzler","B. Brau","J. E. Brau","H. M. Braun","S. F. Brazzale","B. Brelier","J. Bremer","K. Brendlinger","R. Brenner","S. Bressler","D. Britton","F. M. Brochu","I. Brock","R. Brock","F. Broggi","C. Bromberg","J. Bronner","G. Brooijmans","T. Brooks","W. K. Brooks","G. Brown","H. Brown","P. A. Bruckman de Renstrom","D. Bruncko","R. Bruneliere","S. Brunet","A. Bruni","G. Bruni","M. Bruschi","T. Buanes","Q. Buat","F. Bucci","J. Buchanan","P. Buchholz","R. M. Buckingham","A. G. Buckley","S. I. Buda","I. A. Budagov","B. Budick","V. Büscher","L. Bugge","M. K. Bugge","O. Bulekov","A. C. Bundock","M. Bunse","T. Buran","H. Burckhart","S. Burdin","T. Burgess","S. Burke","E. Busato","P. Bussey","C. P. Buszello","B. Butler","J. M. Butler","C. M. Buttar","J. M. Butterworth","W. Buttinger","S. Cabrera Urbán","D. Caforio","O. Cakir","P. Calafiura","G. Calderini","P. Calfayan","R. Calkins","L. P. Caloba","R. Caloi","D. Calvet","S. Calvet","R. Camacho Toro","P. Camarri","D. Cameron","L. M. Caminada","R. Caminal Armadans","S. Campana","M. Campanelli","V. Canale","F. Canelli","A. Canepa","J. Cantero","R. Cantrill","L. Capasso","M. D. M. Capeans Garrido","I. Caprini","M. Caprini","D. Capriotti","M. Capua","R. Caputo","R. Cardarelli","T. Carli","G. Carlino","L. Carminati","B. Caron","S. Caron","E. Carquin","G. D. Carrillo Montoya","A. A. Carter","J. R. Carter","J. 
Carvalho","D. Casadei","M. P. Casado","M. Cascella","C. Caso","A. M. Castaneda Hernandez","E. Castaneda-Miranda","V. Castillo Gimenez","N. F. Castro","G. Cataldi","P. Catastini","A. Catinaccio","J. R. Catmore","A. Cattai","G. Cattani","S. Caughron","V. Cavaliere","P. Cavalleri","D. Cavalli","M. Cavalli-Sforza","V. Cavasinni","F. Ceradini","A. S. Cerqueira","A. Cerri","L. Cerrito","F. Cerutti","S. A. Cetin","A. Chafaq","D. Chakraborty","I. Chalupkova","K. Chan","P. Chang","B. Chapleau","J. D. Chapman","J. W. Chapman","E. Chareyre","D. G. Charlton","V. Chavda","C. A. Chavez Barajas","S. Cheatham","S. Chekanov","S. V. Chekulaev","G. A. Chelkov","M. A. Chelstowska","C. Chen","H. Chen","S. Chen","X. Chen","Y. Chen","A. Cheplakov","R. Cherkaoui El Moursli","V. Chernyatin","E. Cheu","S. L. Cheung","L. Chevalier","G. Chiefari","L. Chikovani","J. T. Childers","A. Chilingarov","G. Chiodini","A. S. Chisholm","R. T. Chislett","A. Chitan","M. V. Chizhov","G. Choudalakis","S. Chouridou","I. A. Christidi","A. Christov","D. Chromek-Burckhart","M. L. Chu","J. Chudoba","G. Ciapetti","A. K. Ciftci","R. Ciftci","D. Cinca","V. Cindro","C. Ciocca","A. Ciocio","M. Cirilli","P. Cirkovic","Z. H. Citron","M. Citterio","M. Ciubancan","A. Clark","P. J. Clark","R. N. Clarke","W. Cleland","J. C. Clemens","B. Clement","C. Clement","Y. Coadou","M. Cobal","A. Coccaro","J. Cochran","L. Coffey","J. G. Cogan","J. Coggeshall","E. Cogneras","J. Colas","S. Cole","A. P. Colijn","N. J. Collins","C. Collins-Tooth","J. Collot","T. Colombo","G. Colon","P. Conde Muiño","E. Coniavitis","M. C. Conidi","S. M. Consonni","V. Consorti","S. Constantinescu","C. Conta","G. Conti","F. Conventi","M. Cooke","B. D. Cooper","A. M. Cooper-Sarkar","K. Copic","T. Cornelissen","M. Corradi","F. Corriveau","A. Cortes-Gonzalez","G. Cortiana","G. Costa","M. J. Costa","D. Costanzo","D. Côté","L. Courneyea","G. Cowan","C. Cowden","B. E. Cox","K. Cranmer","F. Crescioli","M. Cristinziani","G. Crosetti","S. Crépé-Renaudin","C.-M. 
Cuciuc","C. Cuenca Almenar","T. Cuhadar Donszelmann","M. Curatolo","C. J. Curtis","C. Cuthbert","P. Cwetanski","H. Czirr","P. Czodrowski","Z. Czyczula","S. D’Auria","M. D’Onofrio","A. D’Orazio","M. J. Da Cunha Sargedas De Sousa","C. Da Via","W. Dabrowski","A. Dafinca","T. Dai","C. Dallapiccola","M. Dam","M. Dameri","D. S. Damiani","H. O. Danielsson","V. Dao","G. Darbo","G. L. Darlea","J. A. Dassoulas","W. Davey","T. Davidek","N. Davidson","R. Davidson","E. Davies","M. Davies","O. Davignon","A. R. Davison","Y. Davygora","E. Dawe","I. Dawson","R. K. Daya-Ishmukhametova","K. De","R. de Asmundis","S. De Castro","S. De Cecco","J. de Graat","N. De Groot","P. de Jong","C. De La Taille","H. De la Torre","F. De Lorenzi","L. de Mora","L. De Nooij","D. De Pedis","A. De Salvo","U. De Sanctis","A. De Santo","J. B. De Vivie De Regie","G. De Zorzi","W. J. Dearnaley","R. Debbe","C. Debenedetti","B. Dechenaux","D. V. Dedovich","J. Degenhardt","C. Del Papa","J. Del Peso","T. Del Prete","T. Delemontex","M. Deliyergiyev","A. Dell’Acqua","L. Dell’Asta","M. Della Pietra","D. della Volpe","M. Delmastro","P. A. Delsart","C. Deluca","S. Demers","M. Demichev","B. Demirkoz","J. Deng","S. P. Denisov","D. Derendarz","J. E. Derkaoui","F. Derue","P. Dervan","K. Desch","E. Devetak","P. O. Deviveiros","A. Dewhurst","B. DeWilde","S. Dhaliwal","R. Dhullipudi","A. Di Ciaccio","L. Di Ciaccio","A. Di Girolamo","B. Di Girolamo","S. Di Luise","A. Di Mattia","B. Di Micco","R. Di Nardo","A. Di Simone","R. Di Sipio","M. A. Diaz","E. B. Diehl","J. Dietrich","T. A. Dietzsch","S. Diglio","K. Dindar Yagci","J. Dingfelder","F. Dinut","C. Dionisi","P. Dita","S. Dita","F. Dittus","F. Djama","T. Djobava","M. A. B. do Vale","A. Do Valle Wemans","T. K. O. Doan","M. Dobbs","R. Dobinson","D. Dobos","E. Dobson","J. Dodd","C. Doglioni","T. Doherty","Y. Doi","J. Dolejsi","I. Dolenc","Z. Dolezal","B. A. Dolgoshein","T. Dohmae","M. Donadelli","J. Donini","J. Dopke","A. Doria","A. Dos Anjos","A. Dotti","M. T. Dova","A. D. 
Doxiadis","A. T. Doyle","N. Dressnandt","M. Dris","J. Dubbert","S. Dube","E. Duchovni","G. Duckeck","D. Duda","A. Dudarev","F. Dudziak","M. Dührssen","I. P. Duerdoth","L. Duflot","M-A. Dufour","L. Duguid","M. Dunford","H. Duran Yildiz","R. Duxfield","M. Dwuznik","F. Dydak","M. Düren","W. L. Ebenstein","J. Ebke","S. Eckweiler","K. Edmonds","W. Edson","C. A. Edwards","N. C. Edwards","W. Ehrenfeld","T. Eifert","G. Eigen","K. Einsweiler","E. Eisenhandler","T. Ekelof","M. El Kacimi","M. Ellert","S. Elles","F. Ellinghaus","K. Ellis","N. Ellis","J. Elmsheuser","M. Elsing","D. Emeliyanov","R. Engelmann","A. Engl","B. Epp","J. Erdmann","A. Ereditato","D. Eriksson","J. Ernst","M. Ernst","J. Ernwein","D. Errede","S. Errede","E. Ertel","M. Escalier","H. Esch","C. Escobar","X. Espinal Curull","B. Esposito","F. Etienne","A. I. Etienvre","E. Etzion","D. Evangelakou","H. Evans","L. Fabbri","C. Fabre","R. M. Fakhrutdinov","S. Falciano","Y. Fang","M. Fanti","A. Farbin","A. Farilla","J. Farley","T. Farooque","S. Farrell","S. M. Farrington","P. Farthouat","F. Fassi","P. Fassnacht","D. Fassouliotis","B. Fatholahzadeh","A. Favareto","L. Fayard","S. Fazio","R. Febbraro","P. Federic","O. L. Fedin","W. Fedorko","M. Fehling-Kaschek","L. Feligioni","D. Fellmann","C. Feng","E. J. Feng","A. B. Fenyuk","J. Ferencei","W. Fernando","S. Ferrag","J. Ferrando","V. Ferrara","A. Ferrari","P. Ferrari","R. Ferrari","D. E. Ferreira de Lima","A. Ferrer","D. Ferrere","C. Ferretti","A. Ferretto Parodi","M. Fiascaris","F. Fiedler","A. FilipÄiÄ","F. Filthaut","M. Fincke-Keeler","M. C. N. Fiolhais","L. Fiorini","A. Firan","G. Fischer","M. J. Fisher","M. Flechl","I. Fleck","J. Fleckner","P. Fleischmann","S. Fleischmann","T. Flick","A. Floderus","L. R. Flores Castillo","M. J. Flowerdew","T. Fonseca Martin","A. Formica","A. Forti","D. Fortin","D. Fournier","A. J. Fowler","H. Fox","P. Francavilla","M. Franchini","S. Franchino","D. Francis","T. Frank","S. Franz","M. Fraternali","S. Fratina","S. T. French","C. 
Friedrich","F. Friedrich","R. Froeschl","D. Froidevaux","J. A. Frost","C. Fukunaga","E. Fullana Torregrosa","B. G. Fulsom","J. Fuster","C. Gabaldon","O. Gabizon","T. Gadfort","S. Gadomski","G. Gagliardi","P. Gagnon","C. Galea","B. Galhardo","E. J. Gallas","V. Gallo","B. J. Gallop","P. Gallus","K. K. Gan","Y. S. Gao","A. Gaponenko","F. Garberson","M. Garcia-Sciveres","C. García","J. E. García Navarro","R. W. Gardner","N. Garelli","H. Garitaonandia","V. Garonne","C. Gatti","G. Gaudio","B. Gaur","L. Gauthier","P. Gauzzi","I. L. Gavrilenko","C. Gay","G. Gaycken","E. N. Gazis","P. Ge","Z. Gecse","C. N. P. Gee","D. A. A. Geerts","Ch. Geich-Gimbel","K. Gellerstedt","C. Gemme","A. Gemmell","M. H. Genest","S. Gentile","M. George","S. George","P. Gerlach","A. Gershon","C. Geweniger","H. Ghazlane","N. Ghodbane","B. Giacobbe","S. Giagu","V. Giakoumopoulou","V. Giangiobbe","F. Gianotti","B. Gibbard","A. Gibson","S. M. Gibson","M. Gilchriese","D. Gillberg","A. R. Gillman","D. M. Gingrich","J. Ginzburg","N. Giokaris","M. P. Giordani","R. Giordano","F. M. Giorgi","P. Giovannini","P. F. Giraud","D. Giugni","M. Giunta","P. Giusti","B. K. Gjelsten","L. K. Gladilin","C. Glasman","J. Glatzer","A. Glazov","K. W. Glitza","G. L. Glonti","J. R. Goddard","J. Godfrey","J. Godlewski","M. Goebel","T. Göpfert","C. Goeringer","C. Gössling","S. Goldfarb","T. Golling","A. Gomes","L. S. Gomez Fajardo","R. Gonçalo","J. Goncalves Pinto Firmino Da Costa","L. Gonella","S. González de la Hoz","G. Gonzalez Parra","M. L. Gonzalez Silva","S. Gonzalez-Sevilla","J. J. Goodson","L. Goossens","P. A. Gorbounov","H. A. Gordon","I. Gorelov","G. Gorfine","B. Gorini","E. Gorini","A. GoriÅ¡ek","E. Gornicki","B. Gosdzik","A. T. Goshaw","M. Gosselink","M. I. Gostkin","I. Gough Eschrich","M. Gouighri","D. Goujdami","M. P. Goulette","A. G. Goussiou","C. Goy","S. Gozpinar","I. Grabowska-Bold","P. Grafström","K-J. Grahn","F. Grancagnolo","S. Grancagnolo","V. Grassi","V. Gratchev","N. Grau","H. M. Gray","J. A. Gray","E. 
Graziani","O. G. Grebenyuk","T. Greenshaw","Z. D. Greenwood","K. Gregersen","I. M. Gregor","P. Grenier","J. Griffiths","N. Grigalashvili","A. A. Grillo","S. Grinstein","Ph. Gris","Y. V. Grishkevich","J.-F. Grivaz","E. Gross","J. Grosse-Knetter","J. Groth-Jensen","K. Grybel","D. Guest","C. Guicheney","S. Guindon","U. Gul","H. Guler","J. Gunther","B. Guo","J. Guo","P. Gutierrez","N. Guttman","O. Gutzwiller","C. Guyot","C. Gwenlan","C. B. Gwilliam","A. Haas","S. Haas","C. Haber","H. K. Hadavand","D. R. Hadley","P. Haefner","F. Hahn","S. Haider","Z. Hajduk","H. Hakobyan","D. Hall","J. Haller","K. Hamacher","P. Hamal","K. Hamano","M. Hamer","A. Hamilton","S. Hamilton","L. Han","K. Hanagaki","K. Hanawa","M. Hance","C. Handel","P. Hanke","J. R. Hansen","J. B. Hansen","J. D. Hansen","P. H. Hansen","P. Hansson","K. Hara","G. A. Hare","T. Harenberg","S. Harkusha","D. Harper","R. D. Harrington","O. M. Harris","J. Hartert","F. Hartjes","T. Haruyama","A. Harvey","S. Hasegawa","Y. Hasegawa","S. Hassani","S. Haug","M. Hauschild","R. Hauser","M. Havranek","C. M. Hawkes","R. J. Hawkings","A. D. Hawkins","T. Hayakawa","T. Hayashi","D. Hayden","C. P. Hays","H. S. Hayward","S. J. Haywood","S. J. Head","V. Hedberg","L. Heelan","S. Heim","B. Heinemann","S. Heisterkamp","L. Helary","C. Heller","M. Heller","S. Hellman","D. Hellmich","C. Helsens","R. C. W. Henderson","M. Henke","A. Henrichs","A. M. Henriques Correia","S. Henrot-Versille","C. Hensel","T. Henß","C. M. Hernandez","Y. Hernández Jiménez","R. Herrberg","G. Herten","R. Hertenberger","L. Hervas","G. G. Hesketh","N. P. Hessey","E. Higón-Rodriguez","J. C. Hill","K. H. Hiller","S. Hillert","S. J. Hillier","I. Hinchliffe","E. Hines","M. Hirose","F. Hirsch","D. Hirschbuehl","J. Hobbs","N. Hod","M. C. Hodgkinson","P. Hodgson","A. Hoecker","M. R. Hoeferkamp","J. Hoffman","D. Hoffmann","M. Hohlfeld","M. Holder","S. O. Holmgren","T. Holy","J. L. Holzbauer","T. M. Hong","L. Hooft van Huysduynen","S. Horner","J-Y. Hostachy","S. Hou","A. 
Hoummada","J. Howard","J. Howarth","I. Hristova","J. Hrivnac","T. Hryn’ova","P. J. Hsu","S.-C. Hsu","D. Hu","Z. Hubacek","F. Hubaut","F. Huegging","A. Huettmann","T. B. Huffman","E. W. Hughes","G. Hughes","M. Huhtinen","M. Hurwitz","U. Husemann","N. Huseynov","J. Huston","J. Huth","G. Iacobucci","G. Iakovidis","M. Ibbotson","I. Ibragimov","L. Iconomidou-Fayard","J. Idarraga","P. Iengo","O. Igonkina","Y. Ikegami","M. Ikeno","D. Iliadis","N. Ilic","T. Ince","J. Inigo-Golfin","P. Ioannou","M. Iodice","K. Iordanidou","V. Ippolito","A. Irles Quiles","C. Isaksson","M. Ishino","M. Ishitsuka","R. Ishmukhametov","C. Issever","S. Istin","A. V. Ivashin","W. Iwanski","H. Iwasaki","J. M. Izen","V. Izzo","B. Jackson","J. N. Jackson","P. Jackson","M. R. Jaekel","V. Jain","K. Jakobs","S. Jakobsen","T. Jakoubek","J. Jakubek","D. K. Jana","E. Jansen","H. Jansen","A. Jantsch","M. Janus","G. Jarlskog","L. Jeanty","I. Jen-La Plante","D. Jennens","P. Jenni","A. E. Loevschall-Jensen","P. Jež","S. Jézéquel","M. K. Jha","H. Ji","W. Ji","J. Jia","Y. Jiang","M. Jimenez Belenguer","S. Jin","O. Jinnouchi","M. D. Joergensen","D. Joffe","M. Johansen","K. E. Johansson","P. Johansson","S. Johnert","K. A. Johns","K. Jon-And","G. Jones","R. W. L. Jones","T. J. Jones","C. Joram","P. M. Jorge","K. D. Joshi","J. Jovicevic","T. Jovin","X. Ju","C. A. Jung","R. M. Jungst","V. Juranek","P. Jussel","A. Juste Rozas","S. Kabana","M. Kaci","A. Kaczmarska","P. Kadlecik","M. Kado","H. Kagan","M. Kagan","E. Kajomovitz","S. Kalinin","L. V. Kalinovskaya","S. Kama","N. Kanaya","M. Kaneda","S. Kaneti","T. Kanno","V. A. Kantserov","J. Kanzaki","B. Kaplan","A. Kapliy","J. Kaplon","D. Kar","M. Karagounis","K. Karakostas","M. Karnevskiy","V. Kartvelishvili","A. N. Karyukhin","L. Kashif","G. Kasieczka","R. D. Kass","A. Kastanas","M. Kataoka","Y. Kataoka","E. Katsoufis","J. Katzy","V. Kaushik","K. Kawagoe","T. Kawamoto","G. Kawamura","M. S. Kayl","S. Kazama","V. A. Kazanin","M. Y. Kazarinov","R. Keeler","P. T. Keener","R. 
Kehoe","M. Keil","G. D. Kekelidze","J. S. Keller","M. Kenyon","O. Kepka","N. Kerschen","B. P. KerÅ¡evan","S. Kersten","K. Kessoku","J. Keung","F. Khalil-zada","H. Khandanyan","A. Khanov","D. Kharchenko","A. Khodinov","A. Khomich","T. J. Khoo","G. Khoriauli","A. Khoroshilov","V. Khovanskiy","E. Khramov","J. Khubua","H. Kim","S. H. Kim","N. Kimura","O. Kind","B. T. King","M. King","R. S. B. King","J. Kirk","A. E. Kiryunin","T. Kishimoto","D. Kisielewska","T. Kitamura","T. Kittelmann","K. Kiuchi","E. Kladiva","M. Klein","U. Klein","K. Kleinknecht","M. Klemetti","A. Klier","P. Klimek","A. Klimentov","R. Klingenberg","J. A. Klinger","E. B. Klinkby","T. Klioutchnikova","P. F. Klok","S. Klous","E.-E. Kluge","T. Kluge","P. Kluit","S. Kluth","N. S. Knecht","E. Kneringer","E. B. F. G. Knoops","A. Knue","B. R. Ko","T. Kobayashi","M. Kobel","M. Kocian","P. Kodys","K. Köneke","A. C. König","S. Koenig","L. Köpke","F. Koetsveld","P. Koevesarki","T. Koffas","E. Koffeman","L. A. Kogan","S. Kohlmann","F. Kohn","Z. Kohout","T. Kohriki","T. Koi","G. M. Kolachev","H. Kolanoski","V. Kolesnikov","I. Koletsou","J. Koll","A. A. Komar","Y. Komori","T. Kondo","T. Kono","A. I. Kononov","R. Konoplich","N. Konstantinidis","S. Koperny","K. Korcyl","K. Kordas","A. Korn","A. Korol","I. Korolkov","E. V. Korolkova","V. A. Korotkov","O. Kortner","S. Kortner","V. V. Kostyukhin","S. Kotov","V. M. Kotov","A. Kotwal","C. Kourkoumelis","V. Kouskoura","A. Koutsman","R. Kowalewski","T. Z. Kowalski","W. Kozanecki","A. S. Kozhin","V. Kral","V. A. Kramarenko","G. Kramberger","M. W. Krasny","A. Krasznahorkay","J. K. Kraus","S. Kreiss","F. Krejci","J. Kretzschmar","N. Krieger","P. Krieger","K. Kroeninger","H. Kroha","J. Kroll","J. Kroseberg","J. Krstic","U. Kruchonak","H. Krüger","T. Kruker","N. Krumnack","Z. V. Krumshteyn","T. Kubota","S. Kuday","S. Kuehn","A. Kugel","T. Kuhl","D. Kuhn","V. Kukhtin","Y. Kulchitsky","S. Kuleshov","C. Kummer","M. Kuna","J. Kunkle","A. Kupco","H. Kurashige","M. Kurata","Y. A. 
Kurochkin","V. Kus","E. S. Kuwertz","M. Kuze","J. Kvita","R. Kwee","A. La Rosa","L. La Rotonda","L. Labarga","J. Labbe","S. Lablak","C. Lacasta","F. Lacava","H. Lacker","D. Lacour","V. R. Lacuesta","E. Ladygin","R. Lafaye","B. Laforge","T. Lagouri","S. Lai","E. Laisne","M. Lamanna","L. Lambourne","C. L. Lampen","W. Lampl","E. Lancon","U. Landgraf","M. P. J. Landon","J. L. Lane","V. S. Lang","C. Lange","A. J. Lankford","F. Lanni","K. Lantzsch","S. Laplace","C. Lapoire","J. F. Laporte","T. Lari","A. Larner","M. Lassnig","P. Laurelli","V. Lavorini","W. Lavrijsen","P. Laycock","O. Le Dortz","E. Le Guirriec","E. Le Menedeu","T. LeCompte","F. Ledroit-Guillon","H. Lee","J. S. H. Lee","S. C. Lee","L. Lee","M. Lefebvre","M. Legendre","F. Legger","C. Leggett","M. Lehmacher","G. Lehmann Miotto","X. Lei","M. A. L. Leite","R. Leitner","D. Lellouch","B. Lemmer","V. Lendermann","K. J. C. Leney","T. Lenz","G. Lenzen","B. Lenzi","K. Leonhardt","S. Leontsinis","F. Lepold","C. Leroy","J-R. Lessard","C. G. Lester","C. M. Lester","J. Levêque","D. Levin","L. J. Levinson","A. Lewis","G. H. Lewis","A. M. Leyko","M. Leyton","B. Li","H. Li","S. Li","X. Li","Z. Liang","H. Liao","B. Liberti","P. Lichard","M. Lichtnecker","K. Lie","W. Liebig","C. Limbach","A. Limosani","M. Limper","S. C. Lin","F. Linde","J. T. Linnemann","E. Lipeles","A. Lipniacka","T. M. Liss","D. Lissauer","A. Lister","A. M. Litke","C. Liu","D. Liu","H. Liu","J. B. Liu","L. Liu","M. Liu","Y. Liu","M. Livan","S. S. A. Livermore","A. Lleres","J. Llorente Merino","S. L. Lloyd","E. Lobodzinska","P. Loch","W. S. Lockman","T. Loddenkoetter","F. K. Loebinger","A. Loginov","C. W. Loh","T. Lohse","K. Lohwasser","M. Lokajicek","V. P. Lombardo","R. E. Long","L. Lopes","D. Lopez Mateos","J. Lorenz","N. Lorenzo Martinez","M. Losada","P. Loscutoff","F. Lo Sterzo","M. J. Losty","X. Lou","A. Lounis","K. F. Loureiro","J. Love","P. A. Love","A. J. Lowe","F. Lu","H. J. Lubatti","C. Luci","A. Lucotte","A. Ludwig","D. Ludwig","I. Ludwig","J. 
Ludwig","F. Luehring","G. Luijckx","W. Lukas","L. Luminari","E. Lund","B. Lund-Jensen","B. Lundberg","J. Lundberg","O. Lundberg","J. Lundquist","M. Lungwitz","D. Lynn","E. Lytken","H. Ma","L. L. Ma","G. Maccarrone","A. Macchiolo","B. MaÄek","J. Machado Miguens","R. Mackeprang","R. J. Madaras","H. J. Maddocks","W. F. Mader","R. Maenner","T. Maeno","P. Mättig","S. Mättig","L. Magnoni","E. Magradze","K. Mahboubi","J. Mahlstedt","S. Mahmoud","G. Mahout","C. Maiani","C. Maidantchik","A. Maio","S. Majewski","Y. Makida","N. Makovec","P. Mal","B. Malaescu","Pa. Malecki","P. Malecki","V. P. Maleev","F. Malek","U. Mallik","D. Malon","C. Malone","S. Maltezos","V. Malyshev","S. Malyukov","R. Mameghani","J. Mamuzic","A. Manabe","L. Mandelli","I. Mandić","R. Mandrysch","J. Maneira","A. Manfredini","P. S. Mangeard","L. Manhaes de Andrade Filho","J. A. Manjarres Ramos","A. Mann","P. M. Manning","A. Manousakis-Katsikakis","B. Mansoulie","A. Mapelli","L. Mapelli","L. March","J. F. Marchand","F. Marchese","G. Marchiori","M. Marcisovsky","C. P. Marino","F. Marroquim","Z. Marshall","F. K. Martens","L. F. Marti","S. Marti-Garcia","B. Martin","B. Martin","J. P. Martin","T. A. Martin","V. J. Martin","B. Martin dit Latour","S. Martin-Haugh","M. Martinez","V. Martinez Outschoorn","A. C. Martyniuk","M. Marx","F. Marzano","A. Marzin","L. Masetti","T. Mashimo","R. Mashinistov","J. Masik","A. L. Maslennikov","I. Massa","G. Massaro","N. Massol","P. Mastrandrea","A. Mastroberardino","T. Masubuchi","P. Matricon","H. Matsunaga","T. Matsushita","C. Mattravers","J. Maurer","S. J. Maxfield","A. Mayne","R. Mazini","M. Mazur","L. Mazzaferro","M. Mazzanti","J. Mc Donald","S. P. Mc Kee","A. McCarn","R. L. McCarthy","T. G. McCarthy","N. A. McCubbin","K. W. McFarlane","J. A. Mcfayden","G. Mchedlidze","T. Mclaughlan","S. J. McMahon","R. A. McPherson","A. Meade","J. Mechnich","M. Mechtel","M. Medinnis","R. Meera-Lebbai","T. Meguro","R. Mehdiyev","S. Mehlhase","A. Mehta","K. Meier","B. Meirose","C. 
Melachrinos","B. R. Mellado Garcia","F. Meloni","L. Mendoza Navas","Z. Meng","A. Mengarelli","S. Menke","E. Meoni","K. M. Mercurio","P. Mermod","L. Merola","C. Meroni","F. S. Merritt","H. Merritt","A. Messina","J. Metcalfe","A. S. Mete","C. Meyer","C. Meyer","J-P. Meyer","J. Meyer","J. Meyer","T. C. Meyer","J. Miao","S. Michal","L. Micu","R. P. Middleton","S. Migas","L. Mijović","G. Mikenberg","M. Mikestikova","M. Mikuž","D. W. Miller","R. J. Miller","W. J. Mills","C. Mills","A. Milov","D. A. Milstead","D. Milstein","A. A. Minaenko","M. Miñano Moya","I. A. Minashvili","A. I. Mincer","B. Mindur","M. Mineev","Y. Ming","L. M. Mir","G. Mirabelli","J. Mitrevski","V. A. Mitsou","S. Mitsui","P. S. Miyagawa","J. U. Mjörnmark","T. Moa","V. Moeller","K. Mönig","N. Möser","S. Mohapatra","W. Mohr","R. Moles-Valls","A. Molfetas","J. Monk","E. Monnier","J. Montejo Berlingen","F. Monticelli","S. Monzani","R. W. Moore","G. F. Moorhead","C. Mora Herrera","A. Moraes","N. Morange","J. Morel","G. Morello","D. Moreno","M. Moreno Llácer","P. Morettini","M. Morgenstern","M. Morii","A. K. Morley","G. Mornacchi","J. D. Morris","L. Morvaj","H. G. Moser","M. Mosidze","J. Moss","R. Mount","E. Mountricha","S. V. Mouraviev","E. J. W. Moyse","F. Mueller","J. Mueller","K. Mueller","T. A. Müller","T. Mueller","D. Muenstermann","Y. Munwes","W. J. Murray","I. Mussche","E. Musto","A. G. Myagkov","M. Myska","J. Nadal","K. Nagai","R. Nagai","K. Nagano","A. Nagarkar","Y. Nagasaka","M. Nagel","A. M. Nairz","Y. Nakahama","K. Nakamura","T. Nakamura","I. Nakano","G. Nanava","A. Napier","R. Narayan","M. Nash","T. Nattermann","T. Naumann","G. Navarro","H. A. Neal","P. Yu. Nechaeva","T. J. Neep","A. Negri","G. Negri","M. Negrini","S. Nektarijevic","A. Nelson","T. K. Nelson","S. Nemecek","P. Nemethy","A. A. Nepomuceno","M. Nessi","M. S. Neubauer","M. Neumann","A. Neusiedl","R. M. Neves","P. Nevski","F. M. Newcomer","P. R. Newman","V. Nguyen Thi Hong","R. B. Nickerson","R. Nicolaidou","B. Nicquevert","F. 
Niedercorn","J. Nielsen","N. Nikiforou","A. Nikiforov","V. Nikolaenko","I. Nikolic-Audit","K. Nikolics","K. Nikolopoulos","H. Nilsen","P. Nilsson","Y. Ninomiya","A. Nisati","R. Nisius","T. Nobe","L. Nodulman","M. Nomachi","I. Nomidis","S. Norberg","M. Nordberg","P. R. Norton","J. Novakova","M. Nozaki","L. Nozka","I. M. Nugent","A.-E. Nuncio-Quiroz","G. Nunes Hanninger","T. Nunnemann","E. Nurse","B. J. O’Brien","D. C. O’Neil","V. O’Shea","L. B. Oakes","F. G. Oakham","H. Oberlack","J. Ocariz","A. Ochi","S. Oda","S. Odaka","J. Odier","H. Ogren","A. Oh","S. H. Oh","C. C. Ohm","T. Ohshima","H. Okawa","Y. Okumura","T. Okuyama","A. Olariu","A. G. Olchevski","S. A. Olivares Pino","M. Oliveira","D. Oliveira Damazio","E. Oliver Garcia","D. Olivito","A. Olszewski","J. Olszowska","A. Onofre","P. U. E. Onyisi","C. J. Oram","M. J. Oreglia","Y. Oren","D. Orestano","N. Orlando","I. Orlov","C. Oropeza Barrera","R. S. Orr","B. Osculati","R. Ospanov","C. Osuna","G. Otero y Garzon","J. P. Ottersbach","M. Ouchrif","E. A. Ouellette","F. Ould-Saada","A. Ouraou","Q. Ouyang","A. Ovcharova","M. Owen","S. Owen","V. E. Ozcan","N. Ozturk","A. Pacheco Pages","C. Padilla Aranda","S. Pagan Griso","E. Paganis","C. Pahl","F. Paige","P. Pais","K. Pajchel","G. Palacino","C. P. Paleari","S. Palestini","D. Pallin","A. Palma","J. D. Palmer","Y. B. Pan","E. Panagiotopoulou","P. Pani","N. Panikashvili","S. Panitkin","D. Pantea","A. Papadelis","Th. D. Papadopoulou","A. Paramonov","D. Paredes Hernandez","W. Park","M. A. Parker","F. Parodi","J. A. Parsons","U. Parzefall","S. Pashapour","E. Pasqualucci","S. Passaggio","A. Passeri","F. Pastore","Fr. Pastore","G. Pásztor","S. Pataraia","N. Patel","J. R. Pater","S. Patricelli","T. Pauly","M. Pecsy","S. Pedraza Lopez","M. I. Pedraza Morales","S. V. Peleganchuk","D. Pelikan","H. Peng","B. Penning","A. Penson","J. Penwell","M. Perantoni","K. Perez","T. Perez Cavalcanti","E. Perez Codina","M. T. Pérez García-Estañ","V. Perez Reale","L. Perini","H. Pernegger","R. 
Perrino","P. Perrodo","V. D. Peshekhonov","K. Peters","B. A. Petersen","J. Petersen","T. C. Petersen","E. Petit","A. Petridis","C. Petridou","E. Petrolo","F. Petrucci","D. Petschull","M. Petteni","R. Pezoa","A. Phan","P. W. Phillips","G. Piacquadio","A. Picazio","E. Piccaro","M. Piccinini","S. M. Piec","R. Piegaia","D. T. Pignotti","J. E. Pilcher","A. D. Pilkington","J. Pina","M. Pinamonti","A. Pinder","J. L. Pinfold","B. Pinto","C. Pizio","M. Plamondon","M.-A. Pleier","E. Plotnikova","A. Poblaguev","S. Poddar","F. Podlyski","L. Poggioli","D. Pohl","M. Pohl","G. Polesello","A. Policicchio","A. Polini","J. Poll","V. Polychronakos","D. Pomeroy","K. Pommès","L. Pontecorvo","B. G. Pope","G. A. Popeneciu","D. S. Popovic","A. Poppleton","X. Portell Bueso","G. E. Pospelov","S. Pospisil","I. N. Potrap","C. J. Potter","C. T. Potter","G. Poulard","J. Poveda","V. Pozdnyakov","R. Prabhu","P. Pralavorio","A. Pranko","S. Prasad","R. Pravahan","S. Prell","K. Pretzl","D. Price","J. Price","L. E. Price","D. Prieur","M. Primavera","K. Prokofiev","F. Prokoshin","S. Protopopescu","J. Proudfoot","X. Prudent","M. Przybycien","H. Przysiezniak","S. Psoroulas","E. Ptacek","E. Pueschel","J. Purdham","M. Purohit","P. Puzo","Y. Pylypchenko","J. Qian","A. Quadt","D. R. Quarrie","W. B. Quayle","F. Quinonez","M. Raas","V. Radeka","V. Radescu","P. Radloff","T. Rador","F. Ragusa","G. Rahal","A. M. Rahimi","D. Rahm","S. Rajagopalan","M. Rammensee","M. Rammes","A. S. Randle-Conde","K. Randrianarivony","F. Rauscher","T. C. Rave","M. Raymond","A. L. Read","D. M. Rebuzzi","A. Redelbach","G. Redlinger","R. Reece","K. Reeves","E. Reinherz-Aronis","A. Reinsch","I. Reisinger","C. Rembser","Z. L. Ren","A. Renaud","M. Rescigno","S. Resconi","B. Resende","P. Reznicek","R. Rezvani","R. Richter","E. Richter-Was","M. Ridel","M. Rijpstra","M. Rijssenbeek","A. Rimoldi","L. Rinaldi","R. R. Rios","I. Riu","G. Rivoltella","F. Rizatdinova","E. Rizvi","S. H. Robertson","A. Robichaud-Veronneau","D. Robinson","J. E. M. 
Robinson","A. Robson","J. G. Rocha de Lima","C. Roda","D. Roda Dos Santos","A. Roe","S. Roe","O. Røhne","S. Rolli","A. Romaniouk","M. Romano","G. Romeo","E. Romero Adam","N. Rompotis","L. Roos","E. Ros","S. Rosati","K. Rosbach","A. Rose","M. Rose","G. A. Rosenbaum","E. I. Rosenberg","P. L. Rosendahl","O. Rosenthal","L. Rosselet","V. Rossetti","E. Rossi","L. P. Rossi","M. Rotaru","I. Roth","J. Rothberg","D. Rousseau","C. R. Royon","A. Rozanov","Y. Rozen","X. Ruan","F. Rubbo","I. Rubinskiy","N. Ruckstuhl","V. I. Rud","C. Rudolph","G. Rudolph","F. Rühr","A. Ruiz-Martinez","L. Rumyantsev","Z. Rurikova","N. A. Rusakovich","J. P. Rutherfoord","C. Ruwiedel","P. Ruzicka","Y. F. Ryabov","M. Rybar","G. Rybkin","N. C. Ryder","A. F. Saavedra","I. Sadeh","H. F-W. Sadrozinski","R. Sadykov","F. Safai Tehrani","H. Sakamoto","G. Salamanna","A. Salamon","M. Saleem","D. Salek","D. Salihagic","A. Salnikov","J. Salt","B. M. Salvachua Ferrando","D. Salvatore","F. Salvatore","A. Salvucci","A. Salzburger","D. Sampsonidis","B. H. Samset","A. Sanchez","V. Sanchez Martinez","H. Sandaker","H. G. Sander","M. P. Sanders","M. Sandhoff","T. Sandoval","C. Sandoval","R. Sandstroem","D. P. C. Sankey","A. Sansoni","C. Santamarina Rios","C. Santoni","R. Santonico","H. Santos","J. G. Saraiva","T. Sarangi","E. Sarkisyan-Grinbaum","F. Sarri","G. Sartisohn","O. Sasaki","Y. Sasaki","N. Sasao","I. Satsounkevitch","G. Sauvage","E. Sauvan","J. B. Sauvan","P. Savard","V. Savinov","D. O. Savu","L. Sawyer","D. H. Saxon","J. Saxon","C. Sbarra","A. Sbrizzi","D. A. Scannicchio","M. Scarcella","J. Schaarschmidt","P. Schacht","D. Schaefer","U. Schäfer","S. Schaepe","S. Schaetzel","A. C. Schaffer","D. Schaile","R. D. Schamberger","A. G. Schamov","V. Scharf","V. A. Schegelsky","D. Scheirich","M. Schernau","M. I. Scherzer","C. Schiavi","J. Schieck","M. Schioppa","S. Schlenker","E. Schmidt","K. Schmieden","C. Schmitt","S. Schmitt","M. Schmitz","B. Schneider","U. Schnoor","A. Schoening","A. L. S. Schorlemmer","M. 
Schott","D. Schouten","J. Schovancova","M. Schram","C. Schroeder","N. Schroer","M. J. Schultens","J. Schultes","H.-C. Schultz-Coulon","H. Schulz","M. Schumacher","B. A. Schumm","Ph. Schune","C. Schwanenberger","A. Schwartzman","Ph. Schwegler","Ph. Schwemling","R. Schwienhorst","R. Schwierz","J. Schwindling","T. Schwindt","M. Schwoerer","G. Sciolla","W. G. Scott","J. Searcy","G. Sedov","E. Sedykh","S. C. Seidel","A. Seiden","F. Seifert","J. M. Seixas","G. Sekhniaidze","S. J. Sekula","K. E. Selbach","D. M. Seliverstov","B. Sellden","G. Sellers","M. Seman","N. Semprini-Cesari","C. Serfon","L. Serin","L. Serkin","R. Seuster","H. Severini","A. Sfyrla","E. Shabalina","M. Shamim","L. Y. Shan","J. T. Shank","Q. T. Shao","M. Shapiro","P. B. Shatalov","K. Shaw","D. Sherman","P. Sherwood","S. Shimizu","M. Shimojima","T. Shin","M. Shiyakova","A. Shmeleva","M. J. Shochet","D. Short","S. Shrestha","E. Shulga","M. A. Shupe","P. Sicho","A. Sidoti","F. Siegert","Dj. Sijacki","O. Silbert","J. Silva","Y. Silver","D. Silverstein","S. B. Silverstein","V. Simak","O. Simard","Lj. Simic","S. Simion","E. Simioni","B. Simmons","R. Simoniello","M. Simonyan","P. Sinervo","N. B. Sinev","V. Sipica","G. Siragusa","A. Sircar","A. N. Sisakyan","S. Yu. Sivoklokov","J. Sjölin","T. B. Sjursen","L. A. Skinnari","H. P. Skottowe","K. Skovpen","P. Skubic","M. Slater","T. Slavicek","K. Sliwa","V. Smakhtin","B. H. Smart","L. Smestad","S. Yu. Smirnov","Y. Smirnov","L. N. Smirnova","O. Smirnova","B. C. Smith","D. Smith","K. M. Smith","M. Smizanska","K. Smolek","A. A. Snesarev","S. W. Snow","J. Snow","S. Snyder","R. Sobie","J. Sodomka","A. Soffer","C. A. Solans","M. Solar","J. Solc","E. Yu. Soldatov","U. Soldevila","E. Solfaroli Camillocci","A. A. Solodkov","O. V. Solovyanov","V. Solovyev","N. Soni","V. Sopko","B. Sopko","M. Sosebee","R. Soualah","A. Soukharev","S. Spagnolo","F. Spanò","R. Spighi","G. Spigo","R. Spiwoks","M. Spousta","T. Spreitzer","B. Spurlock","R. D. St. Denis","J. Stahlman","R. Stamen","E. 
Stanecka","R. W. Stanek","C. Stanescu","M. Stanescu-Bellu","M. M. Stanitzki","S. Stapnes","E. A. Starchenko","J. Stark","P. Staroba","P. Starovoitov","R. Staszewski","A. Staude","P. Stavina","G. Steele","P. Steinbach","P. Steinberg","I. Stekl","B. Stelzer","H. J. Stelzer","O. Stelzer-Chilton","H. Stenzel","S. Stern","G. A. Stewart","J. A. Stillings","M. C. Stockton","K. Stoerig","G. Stoicea","S. Stonjek","P. Strachota","A. R. Stradling","A. Straessner","J. Strandberg","S. Strandberg","A. Strandlie","M. Strang","E. Strauss","M. Strauss","P. Strizenec","R. Ströhmer","D. M. Strom","J. A. Strong","R. Stroynowski","J. Strube","B. Stugu","I. Stumer","J. Stupak","P. Sturm","N. A. Styles","D. A. Soh","D. Su","HS. Subramania","A. Succurro","Y. Sugaya","C. Suhr","M. Suk","V. V. Sulin","S. Sultansoy","T. Sumida","X. Sun","J. E. Sundermann","K. Suruliz","G. Susinno","M. R. Sutton","Y. Suzuki","Y. Suzuki","M. Svatos","S. Swedish","I. Sykora","T. Sykora","J. Sánchez","D. Ta","K. Tackmann","A. Taffard","R. Tafirout","N. Taiblum","Y. Takahashi","H. Takai","R. Takashima","H. Takeda","T. Takeshita","Y. Takubo","M. Talby","A. Talyshev","M. C. Tamsett","K. G. Tan","J. Tanaka","R. Tanaka","S. Tanaka","S. Tanaka","A. J. Tanasijczuk","K. Tani","N. Tannoury","S. Tapprogge","D. Tardif","S. Tarem","F. Tarrade","G. F. Tartarelli","P. Tas","M. Tasevsky","E. Tassi","M. Tatarkhanov","Y. Tayalati","C. Taylor","F. E. Taylor","G. N. Taylor","W. Taylor","M. Teinturier","F. A. Teischinger","M. Teixeira Dias Castanheira","P. Teixeira-Dias","K. K. Temming","H. Ten Kate","P. K. Teng","S. Terada","K. Terashi","J. Terron","M. Testa","R. J. Teuscher","J. Therhaag","T. Theveneaux-Pelzer","S. Thoma","J. P. Thomas","E. N. Thompson","P. D. Thompson","P. D. Thompson","A. S. Thompson","L. A. Thomsen","E. Thomson","M. Thomson","W. M. Thong","R. P. Thun","F. Tian","M. J. Tibbetts","T. Tic","V. O. Tikhomirov","Y. A. Tikhonov","S. Timoshenko","P. Tipton","S. Tisserant","T. Todorov","S. Todorova-Nova","B. 
Toggerson","J. Tojo","S. Tokár","K. Tokushuku","K. Tollefson","M. Tomoto","L. Tompkins","K. Toms","A. Tonoyan","C. Topfel","N. D. Topilin","I. Torchiani","E. Torrence","H. Torres","E. Torró Pastor","J. Toth","F. Touchard","D. R. Tovey","T. Trefzger","L. Tremblet","A. Tricoli","I. M. Trigger","S. Trincaz-Duvoid","M. F. Tripiana","N. Triplett","W. Trischuk","B. Trocmé","C. Troncon","M. Trottier-McDonald","M. Trzebinski","A. Trzupek","C. Tsarouchas","J. C-L. Tseng","M. Tsiakiris","P. V. Tsiareshka","D. Tsionou","G. Tsipolitis","S. Tsiskaridze","V. Tsiskaridze","E. G. Tskhadadze","I. I. Tsukerman","V. Tsulaia","J.-W. Tsung","S. Tsuno","D. Tsybychev","A. Tua","A. Tudorache","V. Tudorache","J. M. Tuggle","M. Turala","D. Turecek","I. Turk Cakir","E. Turlay","R. Turra","P. M. Tuts","A. Tykhonov","M. Tylmad","M. Tyndel","G. Tzanakos","K. Uchida","I. Ueda","R. Ueno","M. Ugland","M. Uhlenbrock","M. Uhrmacher","F. Ukegawa","G. Unal","A. Undrus","G. Unel","Y. Unno","D. Urbaniec","P. Urquijo","G. Usai","M. Uslenghi","L. Vacavant","V. Vacek","B. Vachon","S. Vahsen","J. Valenta","S. Valentinetti","A. Valero","S. Valkar","E. Valladolid Gallego","S. Vallecorsa","J. A. Valls Ferrer","R. Van Berg","P. C. Van Der Deijl","R. van der Geer","H. van der Graaf","R. Van Der Leeuw","E. van der Poel","D. van der Ster","N. van Eldik","P. van Gemmeren","I. van Vulpen","M. Vanadia","W. Vandelli","A. Vaniachine","P. Vankov","F. Vannucci","R. Vari","T. Varol","D. Varouchas","A. Vartapetian","K. E. Varvell","V. I. Vassilakopoulos","F. Vazeille","T. Vazquez Schroeder","G. Vegni","J. J. Veillet","F. Veloso","R. Veness","S. Veneziano","A. Ventura","D. Ventura","M. Venturi","N. Venturi","V. Vercesi","M. Verducci","W. Verkerke","J. C. Vermeulen","A. Vest","M. C. Vetterli","I. Vichou","T. Vickey","O. E. Vickey Boeriu","G. H. A. Viehhauser","S. Viel","M. Villa","M. Villaplana Perez","E. Vilucchi","M. G. Vincter","E. Vinek","V. B. Vinogradov","M. Virchaux","J. Virzi","O. Vitells","M. Viti","I. 
Vivarelli","F. Vives Vaque","S. Vlachos","D. Vladoiu","M. Vlasak","A. Vogel","P. Vokac","G. Volpi","M. Volpi","G. Volpini","H. von der Schmitt","H. von Radziewski","E. von Toerne","V. Vorobel","V. Vorwerk","M. Vos","R. Voss","T. T. Voss","J. H. Vossebeld","N. Vranjes","M. Vranjes Milosavljevic","V. Vrba","M. Vreeswijk","T. Vu Anh","R. Vuillermet","I. Vukotic","W. Wagner","P. Wagner","H. Wahlen","S. Wahrmund","J. Wakabayashi","S. Walch","J. Walder","R. Walker","W. Walkowiak","R. Wall","P. Waller","B. Walsh","C. Wang","H. Wang","H. Wang","J. Wang","J. Wang","R. Wang","S. M. Wang","T. Wang","A. Warburton","C. P. Ward","M. Warsinsky","A. Washbrook","C. Wasicki","I. Watanabe","P. M. Watkins","A. T. Watson","I. J. Watson","M. F. Watson","G. Watts","S. Watts","A. T. Waugh","B. M. Waugh","M. S. Weber","P. Weber","A. R. Weidberg","P. Weigell","J. Weingarten","C. Weiser","P. S. Wells","T. Wenaus","D. Wendland","Z. Weng","T. Wengler","S. Wenig","N. Wermes","M. Werner","P. Werner","M. Werth","M. Wessels","J. Wetter","C. Weydert","K. Whalen","S. J. Wheeler-Ellis","A. White","M. J. White","S. White","S. R. Whitehead","D. Whiteson","D. Whittington","F. Wicek","D. Wicke","F. J. Wickens","W. Wiedenmann","M. Wielers","P. Wienemann","C. Wiglesworth","L. A. M. Wiik-Fuchs","P. A. Wijeratne","A. Wildauer","M. A. Wildt","I. Wilhelm","H. G. Wilkens","J. Z. Will","E. Williams","H. H. Williams","W. Willis","S. Willocq","J. A. Wilson","M. G. Wilson","A. Wilson","I. Wingerter-Seez","S. Winkelmann","F. Winklmeier","M. Wittgen","S. J. Wollstadt","M. W. Wolter","H. Wolters","W. C. Wong","G. Wooden","B. K. Wosiek","J. Wotschack","M. J. Woudstra","K. W. Wozniak","K. Wraight","M. Wright","B. Wrona","S. L. Wu","X. Wu","Y. Wu","E. Wulf","B. M. Wynne","S. Xella","M. Xiao","S. Xie","C. Xu","D. Xu","B. Yabsley","S. Yacoob","M. Yamada","H. Yamaguchi","A. Yamamoto","K. Yamamoto","S. Yamamoto","T. Yamamura","T. Yamanaka","J. Yamaoka","T. Yamazaki","Y. Yamazaki","Z. Yan","H. Yang","U. K. Yang","Y. Yang","Z. 
Yang","S. Yanush","L. Yao","Y. Yao","Y. Yasu","G. V. Ybeles Smit","J. Ye","S. Ye","M. Yilmaz","R. Yoosoofmiya","K. Yorita","R. Yoshida","C. Young","C. J. Young","S. Youssef","D. Yu","J. Yu","J. Yu","L. Yuan","A. Yurkewicz","M. Byszewski","B. Zabinski","R. Zaidan","A. M. Zaitsev","Z. Zajacova","L. Zanello","D. Zanzi","A. Zaytsev","C. Zeitnitz","M. Zeman","A. Zemla","C. Zendler","O. Zenin","T. ŽeniÅ¡","Z. Zinonos","S. Zenz","D. Zerwas","G. Zevi della Porta","Z. Zhan","D. Zhang","H. Zhang","J. Zhang","X. Zhang","Z. Zhang","L. Zhao","T. Zhao","Z. Zhao","A. Zhemchugov","J. Zhong","B. Zhou","N. Zhou","Y. Zhou","C. G. Zhu","H. Zhu","J. Zhu","Y. Zhu","X. Zhuang","V. Zhuravlov","D. Zieminska","N. I. Zimin","R. Zimmermann","S. Zimmermann","S. Zimmermann","M. Ziolkowski","R. Zitoun","L. Živković","V. V. Zmouchko","G. Zobernig","A. Zoccoli","M. zur Nedden","V. Zutshi","L. Zwalinski"],"year":"2012","journal":"The European Physical Journal C","publisher":"Springer Nature","subject":"Physics and Astronomy (miscellaneous)","type":"journal-article","sha":"19caae86acd7e35fd26a1c78de9ca404f989ad8d"}
+{"doi":"10.1145/1062745.1062833","title":"The language observatory project (LOP)","authors":["Yoshiki Mikami","Sebastiano Vigna","Pavol Zavarsky","Mohd Zaidi Abd Rozan","Izumi Suzuki","Masayuki Takahashi","Tomohide Maki","Irwan Nizan Ayob","Paolo Boldi","Massimo Santini"],"year":"2005","journal":"Special interest tracks and posters of the 14th international conference on World Wide Web - WWW '05","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"914efcd0fd5e2b6a14bf63bcba1d2b159edd139e"}
+{"doi":"10.1145/1075382.1075387","title":"Analysis of recursive state machines","authors":["Rajeev Alur","Michael Benedikt","Kousha Etessami","Patrice Godefroid","Thomas Reps","Mihalis Yannakakis"],"year":"2005","journal":"ACM Transactions on Programming Languages and Systems","publisher":"Association for Computing Machinery (ACM)","subject":"Software","type":"journal-article","sha":"c33a7f02d85e6e17c8509324f8e95483f9db62bb"}
+{"doi":"10.1145/1117309.1117356","title":"An eye-tracking methodology for characterizing program comprehension processes","authors":["Roman Bednarik","Markku Tukiainen"],"year":"2006","journal":"Proceedings of the 2006 symposium on Eye tracking research & applications - ETRA '06","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"055ac7185a7bf5111321c04c0eafc9a46d746d81"}
+{"doi":"10.1145/1119766.1119769","title":"Image retrieval and perceptual similarity","authors":["Dirk Neumann","Karl R. Gegenfurtner"],"year":"2006","journal":"ACM Transactions on Applied Perception","publisher":"Association for Computing Machinery (ACM)","subject":"Theoretical Computer Science","type":"journal-article","sha":"bbf94799250c4b1f4cfcd8349fb46025ff0d2f3b"}
+{"doi":"10.1145/1140277.1140286","title":"Partially overlapped channels not considered harmful","authors":["Arunesh Mishra","Vivek Shrivastava","Suman Banerjee","William Arbaugh"],"year":"2006","journal":"Proceedings of the joint international conference on Measurement and modeling of computer systems - SIGMETRICS '06/Performance '06","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"df5afbdb6bd25df496a429b1409283531701849d"}
+{"doi":"10.1145/1180639.1180656","title":"Maximum unfolded embedding","authors":["Huan Wang","Shuicheng Yan","Thomas Huang","Xiaoou Tang"],"year":"2006","journal":"Proceedings of the 14th annual ACM international conference on Multimedia - MULTIMEDIA '06","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"ad0cb9c59d923f08848fc34e42b9a1be3002e952"}
+{"doi":"10.1145/1236360.1236369","title":"Robust system multiangulation using subspace methods","authors":["Joshua N. Ash","Lee C. Potter"],"year":"2007","journal":"Proceedings of the 6th international conference on Information processing in sensor networks - IPSN '07","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"534c4d9a189c1c35ff412a1f4d36ef19e08ee2c8"}
+{"doi":"10.1145/136586.136587","title":"The design and implementation of hierarchical software systems with reusable components","authors":["Don Batory","Sean O'Malley"],"year":"1992","journal":"ACM Transactions on Software Engineering and Methodology","publisher":"Association for Computing Machinery (ACM)","subject":"Software","type":"journal-article","sha":"fcc157fdb9a734a51056769b29d23e2cd7cffa2a"}
+{"doi":"10.1145/1401890.1402014","title":"Using predictive analysis to improve invoice-to-cash collection","authors":["Sai Zeng","Prem Melville","Christian A. Lang","Ioana Boier-Martin","Conrad Murphy"],"year":"2008","journal":"Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining - KDD 08","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"790c79966f89615270ca4ab743f8602f110b16a9"}
+{"doi":"10.1145/1450058.1450069","title":"Model-based validation of QoS properties of biomedical sensor networks","authors":["Simon Tschirner","Liang Xuedong","Wang Yi"],"year":"2008","journal":"Proceedings of the 7th ACM international conference on Embedded software - EMSOFT '08","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"9abdc192d07adf4c07b677aac1ba0f71115c054a"}
+{"doi":"10.1145/1486525.1486531","title":"Algorithm 892","authors":["Kristjan Jonasson"],"year":"2009","journal":"ACM Transactions on Mathematical Software","publisher":"Association for Computing Machinery (ACM)","subject":"Software","type":"journal-article","sha":"1dd7a777a3b578f98ca35d30e3bdde65b989cefd"}
+{"doi":"10.1145/1508293.1508296","title":"A lock-free, concurrent, and incremental stack scanning for garbage collectors","authors":["Gabriel Kliot","Erez Petrank","Bjarne Steensgaard"],"year":"2009","journal":"Proceedings of the 2009 ACM SIGPLAN/SIGOPS international conference on Virtual execution environments - VEE '09","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"395f08b05f1c83b68ac7c4c9ddf6bdcb32c4f9f0"}
+{"doi":"10.1145/1860058.1860078","title":"Fundamental tradeoffs in vehicular ad hoc networks","authors":["Mohammad Nekoui","Hossein Pishro-nik"],"year":"2010","journal":"Proceedings of the seventh ACM international workshop on VehiculAr InterNETworking - VANET '10","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"f6e140e4653c7f4867afb266685d552981668fc3"}
+{"doi":"10.1145/198429.198431","title":"Algorithm 736; hyperelliptic integrals and the surface measure of ellipsoids","authors":["Charles F. Dunkl","Donald E. Ramirez"],"year":"1994","journal":"ACM Transactions on Mathematical Software","publisher":"Association for Computing Machinery (ACM)","subject":"Software","type":"journal-article","sha":"24417e305419177dc24ca4aff2075ede098e5a30"}
+{"doi":"10.1145/1998196.1998203","title":"Metric graph reconstruction from noisy data","authors":["Mridul Aanjaneya","Frederic Chazal","Daniel Chen","Marc Glisse","Leonidas J. Guibas","Dmitriy Morozov"],"year":"2011","journal":"Proceedings of the 27th annual ACM symposium on Computational geometry - SoCG '11","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"84f74eaa6589d4f197bb57269933a5812969f163"}
+{"doi":"10.1145/2185632.2185676","title":"Computing bounded reach sets from sampled simulation traces","authors":["Zhenqi Huang","Sayan Mitra"],"year":"2012","journal":"Proceedings of the 15th ACM international conference on Hybrid Systems: Computation and Control - HSCC '12","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"eb6a8eb2f133cd5e73dd7dcefe1cd2e275d1b9b6"}
+{"doi":"10.1145/223904.223941","title":"Learning to write together using groupware","authors":["Alex Mitchell","Ilona Posner","Ronald Baecker"],"year":"1995","journal":"Proceedings of the SIGCHI conference on Human factors in computing systems - CHI '95","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"cf6c3f2d7722735c6f401f54d26932c9dc6bd8e2"}
+{"doi":"10.1145/235833.236050","title":"Using GOMS for user interface design and evaluation: which technique?","authors":["Bonnie E. John","David E. Kieras"],"year":"1996","journal":"ACM Transactions on Computer-Human Interaction","publisher":"Association for Computing Machinery (ACM)","subject":"Human-Computer Interaction","type":"journal-article","sha":"a0f4aaabb8c159b11b98418bb704d612cb601d5a"}
+{"doi":"10.1145/2508363.2508380","title":"Reconstructing detailed dynamic face geometry from monocular video","authors":["Pablo Garrido","Levi Valgaert","Chenglei Wu","Christian Theobalt"],"year":"2013","journal":"ACM Transactions on Graphics","publisher":"Association for Computing Machinery (ACM)","subject":"Computer Graphics and Computer-Aided Design","type":"journal-article","sha":"976208323fd68f403e3d7c66f66d39f8788fe24c"}
+{"doi":"10.1145/2611778","title":"A Survey and Classification of Storage Deduplication Systems","authors":["João Paulo","José Pereira"],"year":"2014","journal":"ACM Computing Surveys","publisher":"Association for Computing Machinery (ACM)","subject":"Theoretical Computer Science","type":"journal-article","sha":"46a574413123beb2ba0572c563e1a4883baec997"}
+{"doi":"10.1145/2629628","title":"Extending UML/MARTE to Support Discrete Controller Synthesis, Application to Reconfigurable Systems-on-Chip Modeling","authors":["Sébastien Guillet","Florent de Lamotte","Nicolas le Griguer","Éric Rutten","Guy Gogniat","Jean-Philippe Diguet"],"year":"2014","journal":"ACM Transactions on Reconfigurable Technology and Systems","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"journal-article","sha":"d7e433750b0bb782eae277bca97db3ab74c551ea"}
+{"doi":"10.1145/340916.340924","title":"Art-based rendering with continuous levels of detail","authors":["Lee Markosian","Barbara J. Meier","Michael A. Kowalski","Loring S. Holden","J. D. Northrup","John F. Hughes"],"year":"2000","journal":"Proceedings of the first international symposium on Non-photorealistic animation and rendering - NPAR '00","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"5c8e4b7272d8fa191fbf753aeef3d5ea90938193"}
+{"doi":"10.1145/383962.384001","title":"Competitive concurrent distributed queuing","authors":["Maurice Herlihy","Srikanta Tirthapura","Roger Wattenhofer"],"year":"2001","journal":"Proceedings of the twentieth annual ACM symposium on Principles of distributed computing - PODC '01","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"e1f0af54b1ddb555a06f1609ed6bee816e48c247"}
+{"doi":"10.1145/42404.42410","title":"Dynamic hash tables","authors":["Per-Ake Larson"],"year":"1988","journal":"Communications of the ACM","publisher":"Association for Computing Machinery (ACM)","subject":"Computer Science(all)","type":"journal-article","sha":"21ea85d0f30d9eb9c370b2e2ac38f81e766215b4"}
+{"doi":"10.1145/586081.586082","title":"Appendices A--D: A scalable method for deductive generalization in the spreadsheet paradigm","authors":["Margaret Burnett","Sherry Yang","Jay Summet"],"year":"2002","journal":"ACM Transactions on Computer-Human Interaction","publisher":"Association for Computing Machinery (ACM)","subject":"Human-Computer Interaction","type":"journal-article","sha":"dcfc20eeef5430d890d076672d2d4aa477918687"}
+{"doi":"10.1145/860722.860752","title":"Constructing optimal policies for agents with constrained architectures","authors":["Dmitri A. Dolgov","Edmund H. Durfee"],"year":"2003","journal":"Proceedings of the second international joint conference on Autonomous agents and multiagent systems - AAMAS '03","publisher":"Association for Computing Machinery (ACM)","subject":"","type":"proceedings-article","sha":"984ee076390652b3d157af8f12b639720d60e721"}
+{"doi":"10.1152/advan.00069.2009","title":"Experience with a theme-based integrated renal module for a second-year MBBS class","authors":["R. Shafi","K. H. M. Quadri","W. Ahmed","S. N. Mahmud","M. Iqbal"],"year":"2010","journal":"AJP: Advances in Physiology Education","publisher":"American Physiological Society","subject":"Physiology","type":"journal-article","sha":"ca480f120dafad3e18c8d71120708e8b1b44a090"}
+{"doi":"10.1152/ajpheart.00147.2002","title":"Prologue: nonclassical modalities of myocardial preconditioning","authors":["Garrett J. Gross","David C. Warltier"],"year":"2002","journal":"American Journal of Physiology - Heart and Circulatory Physiology","publisher":"American Physiological Society","subject":"Physiology (medical)","type":"journal-article","sha":"1aaac38af6d1b816bafe69820b1cd9b7036de6d4"}
+{"doi":"10.1155/2007/49389","title":"Channel Equalization in Filter Bank Based Multicarrier Modulation for Wireless Communications","authors":["Tero Ihalainen","Tobias Hidalgo Stitz","Mika Rinne","Markku Renfors"],"year":"2007","journal":"EURASIP Journal on Advances in Signal Processing","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"69d6a74679be6259b2c0597b23a198d6d609ffa3"}
+{"doi":"10.1155/2010/615623","title":"Approximate Minimum Bit Error Rate Equalization for Fading Channels","authors":["Lorant Kovacs","Janos Levendovszky","Andras Olah","Gergely Treplan"],"year":"2010","journal":"EURASIP Journal on Advances in Signal Processing","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"7ef4097546b367c09ac22280987c969e5ee97f7e"}
+{"doi":"10.1155/s0161171291000145","title":"On Galois projective group rings","authors":["George Szeto","Linjun Ma"],"year":"1991","journal":"International Journal of Mathematics and Mathematical Sciences","publisher":"Hindawi Publishing Corporation","subject":"Mathematics (miscellaneous)","type":"journal-article","sha":"c47ed77d410cae2cb7eee986088679b40edcec0a"}
+{"doi":"10.1155/s0161171292000528","title":"Surface water waves due to an oscillatory wavemaker in the presence of surface tension","authors":["B. N. Mandal","S. Banerjea"],"year":"1992","journal":"International Journal of Mathematics and Mathematical Sciences","publisher":"Hindawi Publishing Corporation","subject":"Mathematics (miscellaneous)","type":"journal-article","sha":"6bc6721ef9b0a992add36ae549bce8f67e8001ec"}
+{"doi":"10.1158/0008-5472.can-13-2702","title":"Activin Upregulation by NF- B Is Required to Maintain Mesenchymal Features of Cancer Stem-like Cells in Non-Small Cell Lung Cancer","authors":["J. J. Wamsley","M. Kumar","D. F. Allison","S. H. Clift","C. M. Holzknecht","S. J. Szymura","S. A. Hoang","X. Xu","C. A. Moskaluk","D. R. Jones","S. Bekiranov","M. W. Mayo"],"year":"2015","journal":"Cancer Research","publisher":"American Association for Cancer Research (AACR)","subject":"Cancer Research","type":"journal-article","sha":"6292cf33eb729fd662a5f9d15094fcb95817dd3c"}
+{"doi":"10.1158/1055-9965.epi-07-0168","title":"Flavonoids and the Risk of Oral and Pharyngeal Cancer: A Case-Control Study from Italy","authors":["M. Rossi","W. Garavello","R. Talamini","E. Negri","C. Bosetti","L. Dal Maso","P. Lagiou","A. Tavani","J. Polesel","L. Barzan","V. Ramazzotti","S. Franceschi","C. La Vecchia"],"year":"2007","journal":"Cancer Epidemiology Biomarkers &amp; Prevention","publisher":"American Association for Cancer Research (AACR)","subject":"Epidemiology","type":"journal-article","sha":"fc42da454ba17cccd362c72d33d70d7271d32e2e"}
+{"doi":"10.1158/1535-7163.mct-08-0485","title":"CXCR3 expression is associated with poor survival in breast cancer and promotes metastasis in a murine model","authors":["X. Ma","K. Norsworthy","N. Kundu","W. H. Rodgers","P. A. Gimotty","O. Goloubeva","M. Lipsky","Y. Li","D. Holt","A. Fulton"],"year":"null","journal":"Molecular Cancer Therapeutics","publisher":"American Association for Cancer Research (AACR)","subject":"Cancer Research","type":"journal-article","sha":"594bf256ba94c22235c739a758b4cee98df2050d"}
+{"doi":"10.1158/2326-6066.cir-13-0059-t","title":"Combining Oncolytic HSV-1 with Immunogenic Cell Death-Inducing Drug Mitoxantrone Breaks Cancer Immune Tolerance and Improves Therapeutic Efficacy","authors":["S. T. Workenhe","J. G. Pol","B. D. Lichty","D. T. Cummings","K. L. Mossman"],"year":"2013","journal":"Cancer Immunology Research","publisher":"American Association for Cancer Research (AACR)","subject":"","type":"journal-article","sha":"753f4c2b0c450f7c474449c4a80a5a5845402db1"}
+{"doi":"10.1158/ajc.1934.878","title":"A Survey of Cancer Cases in the Hospitals of Bridgeport, Conn., 1928-1932 Inclusive","authors":["W. F. Wild"],"year":"1934","journal":"The American Journal of Cancer","publisher":"American Association for Cancer Research (AACR)","subject":"","type":"journal-article","sha":"712b2d5407f86f6e4c58f75c44fb628d213bab8d"}
+{"doi":"10.11606/issn.2238-6149.v26i1p146-152","title":"Modificações no índice de massa corporal em mulheres idosas após um programa de reabilitação física","authors":["Jorge Luiz de Brito-Gomes","Raphael José Perrier-Melo","Ademar Lucena Filho","Marcos André Moura dos Santos","Manoel Da Cunha Costa","Fernando José de Sá Pereira Guimarães"],"year":"null","journal":"Revista de Terapia Ocupacional da Universidade de São Paulo","publisher":"Universidade de Sao Paulo Sistema Integrado de Bibliotecas - SIBiUSP","subject":"","type":"journal-article","sha":"9afe783bb2f56fa81f6d92dbdcdf0c04d07a16b1"}
+{"doi":"10.1161/01.hyp.0000085331.22169.3f","title":"Attenuated Responses to Angiotensin II in Follitropin Receptor Knockout Mice, a Model of Menopause-Associated Hypertension","authors":["D. Javeshghani","R. M. Touyz","M. R. Sairam","A. Virdis","M. F. Neves","E. L. Schiffrin"],"year":"2003","journal":"Hypertension","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Internal Medicine","type":"journal-article","sha":"6322ad9cbca7d007cddbfb5ef00f55c2407510db"}
+{"doi":"10.1161/circinterventions.109.877522","title":"Polymer-Free Biolimus A9-Coated Stent Demonstrates More Sustained Intimal Inhibition, Improved Healing, and Reduced Inflammation Compared With a Polymer-Coated Sirolimus-Eluting Cypher Stent in a Porcine Model","authors":["N. Tada","R. Virmani","G. Grant","L. Bartlett","A. Black","C. Clavijo","U. Christians","R. Betts","D. Savage","S. H. Su","J. Shulze","S. Kar"],"year":"2010","journal":"Circulation: Cardiovascular Interventions","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Cardiology and Cardiovascular Medicine","type":"journal-article","sha":"4e910ab15f305696e77817254d3e28665d1d8822"}
+{"doi":"10.1161/circulationaha.111.070045","title":"Prognostic Values of Clockwise and Counterclockwise Rotation for Cardiovascular Mortality in Japanese Subjects: A 24-Year Follow-Up of the National Integrated Project for Prospective Observation of Noncommunicable Disease and Its Trends in the Aged, 1980-2004 (NIPPON DATA80)","authors":["Y. Nakamura","T. Okamura","A. Higashiyama","M. Watanabe","A. Kadota","T. Ohkubo","K. Miura","F. Kasagi","K. Kodama","A. Okayama","H. Ueshima"," "],"year":"2012","journal":"Circulation","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Physiology (medical)","type":"journal-article","sha":"9dbab3f1a53a6766c0702e7c46decba8c3fbe8b4"}
+{"doi":"10.1161/hypertensionaha.108.114835","title":"Effects of Habitual Alcohol Intake on Ambulatory Blood Pressure, Heart Rate, and Its Variability Among Japanese Men","authors":["T. Ohira","T. Tanigawa","M. Tabata","H. Imano","A. Kitamura","M. Kiyama","S. Sato","T. Okamura","R. Cui","K. A. Koike","T. Shimamoto","H. Iso"],"year":"2009","journal":"Hypertension","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Internal Medicine","type":"journal-article","sha":"61c5b1cd86740b82959b395328d4579466d7a9f6"}
+{"doi":"10.1161/jaha.114.001210","title":"Platelet Endothelial Cell Adhesion Molecule-1 Mediates Endothelial-Cardiomyocyte Communication and Regulates Cardiac Function","authors":["M. E. McCormick","C. Collins","C. A. Makarewich","Z. Chen","M. Rojas","M. S. Willis","S. R. Houser","E. Tzima"],"year":"2015","journal":"Journal of the American Heart Association","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"","type":"journal-article","sha":"99eb13c577158707929f26be627c28a05ecb03fc"}
+{"doi":"10.1161/strokeaha.113.001601","title":"Letter by Chen Regarding Article, \"The Impact of Green Tea and Coffee Consumption on the Reduced Risk of Stroke Incidence in Japanese Population: The Japan Public Health Center-Based Study Cohort\"","authors":["R. Chen"],"year":"2013","journal":"Stroke","publisher":"Ovid Technologies (Wolters Kluwer Health)","subject":"Medicine(all)","type":"journal-article","sha":"bcf0bc7ee026de32b15d8dcd46ce3af5467a2864"}
+{"doi":"10.11648/j.ss.20130204.12","title":"The Connection between the Depopulation of Localities and Passenger Rail Services in the Province of Buenos Aires in Argentina between 1960 and 2009","authors":["Juan Manuel Diez-Tetamanti"],"year":"2013","journal":"Social Sciences","publisher":"Science Publishing Group","subject":"","type":"journal-article","sha":"98592cd4965ffcd353a94f83f1f8c23cf56a4c01"}
+{"doi":"10.1172/jci117564","title":"Inhibition of murine nephritogenic effector T cells by a clone-specific suppressor factor.","authors":["C M Meyers","C J Kelly"],"year":"1994","journal":"Journal of Clinical Investigation","publisher":"American Society for Clinical Investigation","subject":"Medicine(all)","type":"journal-article","sha":"feb3a54218e15a46899e217be5087b34e5b009d6"}
+{"doi":"10.1175/1520-0450(1999)038<1346:efnssa>2.0.co;2","title":"Evaporation from Nonvegetated Surfaces: Surface Aridity Methods and Passive Microwave Remote Sensing","authors":["Anthony T. Cahill","Marc B. Parlange","Thomas J. Jackson","Peggy O’Neill","T. J. Schmugge"],"year":"1999","journal":"Journal of Applied Meteorology","publisher":"American Meteorological Society","subject":"Atmospheric Science","type":"journal-article","sha":"28e234631d3935d9567401a10aef5be743435699"}
+{"doi":"10.1175/2008mwr2582.1","title":"Large-Eddy Simulations of a Drizzling, Stratocumulus-Topped Marine Boundary Layer","authors":["Andrew S. Ackerman","Margreet C. vanZanten","Bjorn Stevens","Verica Savic-Jovcic","Christopher S. Bretherton","Andreas Chlond","Jean-Christophe Golaz","Hongli Jiang","Marat Khairoutdinov","Steven K. Krueger","David C. Lewellen","Adrian Lock","Chin-Hoh Moeng","Kozo Nakamura","Markus D. Petters","Jefferson R. Snider","Sonja Weinbrecht","Mike Zulauf"],"year":"2009","journal":"Monthly Weather Review","publisher":"American Meteorological Society","subject":"Atmospheric Science","type":"journal-article","sha":"5599b16f786f7966233f70dd603957dba09c326b"}
+{"doi":"10.1176/ajp.2006.163.4.682","title":"Natural History of Male Psychological Health, XV: Retirement Satisfaction","authors":["George E. Vaillant","Ana C. DiRago","Ken Mukamal"],"year":"2006","journal":"American Journal of Psychiatry","publisher":"American Psychiatric Publishing","subject":"Medicine(all)","type":"journal-article","sha":"8eb45e31369a1e2642f9b5d4259318ba571e694d"}
+{"doi":"10.1177/0146167283093008","title":"Expectancies about Control Over Health","authors":["Kenneth A. Wallston","Roberta A. Smith","Joan E. King","Patricia R. Forsberg","Barbara Strudler Wallston","Vivian Tong Nagy"],"year":"1983","journal":"Personality and Social Psychology Bulletin","publisher":"SAGE Publications","subject":"Social Psychology","type":"journal-article","sha":"6c078a94336c329f39493e83f27a74f8df873882"}
+{"doi":"10.1177/027836499101000606","title":"Identifying the Independent Inertial Parameter Space of Robot Manipulators","authors":["S.-Y. Sheu","M. W. Walker"],"year":"1991","journal":"The International Journal of Robotics Research","publisher":"SAGE Publications","subject":"Mechanical Engineering","type":"journal-article","sha":"1c22f804f7ec2af6799260f79195cdfb690991e6"}
+{"doi":"10.1177/1470412906070518","title":"Movement before Cinematography: The High-Speed Qualities of Sentiment","authors":["Jimena Canales"],"year":"2006","journal":"Journal of Visual Culture","publisher":"SAGE Publications","subject":"Communication","type":"journal-article","sha":"8f9325389c0d7d76276b982a01f80d75dde44dfa"}
+{"doi":"10.1177/239700220501900418","title":"Risikomanagement mit leistungsabha ngiger Vergu tung: Einfluss variabler Entgeltformen auf das Kreditvergabeverhalten von Banken","authors":["I. A. Falkenstein"],"year":"2005","journal":"German Journal of Human Resource Management: Zeitschrift f&#252;r Personalforschung","publisher":"SAGE Publications","subject":"","type":"journal-article","sha":"eeb8e6e99a475e69c1c510a202174f1f64631b56"}
+{"doi":"10.1183/09031936.00011010","title":"Iloprost-induced thrombocytopenia: a case proven by rechallenge","authors":["A. B. Taegtmeyer","C. Zettler","M. Siegemund","D. A. Tsakiris","A. E. Ratz Bravo","H. Pargger","S. Kraehenbuehl","M. Haschke"],"year":"2010","journal":"European Respiratory Journal","publisher":"European Respiratory Society (ERS)","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"79cc73d4a6d0420ffebaa11149a97f53b3c6838b"}
+{"doi":"10.1183/09031936.02.00264202","title":"The correlation of emphysema or airway obstruction with the risk of lung cancer: a matched case-controlled study","authors":["K. Kishi","J.W. Gurney","D.R. Schroeder","P.D. Scanlon","S.J. Swensen","J.R. Jett"],"year":"2002","journal":"European Respiratory Journal","publisher":"European Respiratory Society (ERS)","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"0f6f47dbeb6bef25f0281c1d8d04d2dbccc507c2"}
+{"doi":"10.1183/09031936.05.00082205","title":"Clinical utility of CT in children with persistent focal chest abnormality","authors":["S. Montella"],"year":"2005","journal":"European Respiratory Journal","publisher":"European Respiratory Society (ERS)","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"ea6d6d3f190995c2d9813a4cfb47becfc9f5de41"}
+{"doi":"10.1183/09031936.98.12051067","title":"Neutrophil chemokines in bronchoalveolar lavage fluid and leukocyte-conditioned medium from nonsmokers and smokers","authors":["D. Morrison","R.M. Strieter","S.C. Donnelly","M.D. Burdick","S.L. Kunkel","W. MacNee"],"year":"1998","journal":"European Respiratory Journal","publisher":"European Respiratory Society (ERS)","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"f3c8fda8d2a21145380f37555b5f91c425aff08a"}
+{"doi":"10.1186/1129-2377-14-77","title":"Monozygotic twin sisters discordant for familial hemiplegic migraine","authors":["José Barros","Rui Barreto","Ana Brandão","Joana Domingos","Joana Damásio","Cristina Ramos","Carolina Lemos","Jorge Sequeiros","Isabel Alonso","José Pereira-Monteiro"],"year":"2013","journal":"The Journal of Headache and Pain","publisher":"Springer Nature","subject":"Anesthesiology and Pain Medicine","type":"journal-article","sha":"952059f84aa04b11f2ce71192a959af8f273fc78"}
+{"doi":"10.1186/1465-9921-6-127","title":"Pulmonary function and fuel use: A population survey","authors":["Asim Saha","N Mohan Rao","PK Kulkarni","PK Majumdar","HN Saiyed"],"year":"2005","journal":"Respiratory Research","publisher":"Springer Nature","subject":"Pulmonary and Respiratory Medicine","type":"journal-article","sha":"ce3f33f4a8183294ea1b4bc213961b1bdd0ca7b0"}
+{"doi":"10.1186/1471-2105-15-s11-s8","title":"Divergence of protein-coding capacity and regulation in the Bacillus cereus sensu lato group","authors":["Inimary T Toby","Jonah Widmer","David W Dyer"],"year":"2014","journal":"BMC Bioinformatics","publisher":"Springer Nature","subject":"Biochemistry","type":"journal-article","sha":"65efb268ab86d5786a9667fac7f4cdb11e6956d5"}
+{"doi":"10.1186/1471-2458-13-1176","title":"Attentional bias retraining in cigarette smokers attempting smoking cessation (ARTS): Study protocol for a double blind randomised controlled trial","authors":["Rachna Begh","Marcus R Munafò","Saul Shiffman","Stuart G Ferguson","Linda Nichols","Mohammed A Mohammed","Roger L Holder","Stephen Sutton","Paul Aveyard"],"year":"2013","journal":"BMC Public Health","publisher":"Springer Nature","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"949111c18bec095c2e707c36088463262a9ce44c"}
+{"doi":"10.1186/1471-2474-14-46","title":"Effectiveness of a cognitive-behavioral group intervention for knee osteoarthritis pain: protocol of a randomized controlled trial","authors":["Eeva-Eerika Helminen","Sanna H Sinikallio","Anna L Valjakka","Rauni H Väisänen-Rouvali","Jari P Arokoski"],"year":"2013","journal":"BMC Musculoskeletal Disorders","publisher":"Springer Nature","subject":"Orthopedics and Sports Medicine","type":"journal-article","sha":"8ba5622d58b0807dde1e943483f2cf5720fd1ea7"}
+{"doi":"10.1186/1472-6920-14-75","title":"Constructing core competency indicators for clinical teachers in Taiwan: a qualitative analysis and an analytic hierarchy process","authors":["Ai-Tzu Li","Jou-Wei Lin"],"year":"2014","journal":"BMC Medical Education","publisher":"Springer Nature","subject":"Education","type":"journal-article","sha":"6f0db1e4f750b204cab2722c0580c88a2940c6f4"}
+{"doi":"10.1186/1475-2875-11-177","title":"The effects of serum lipids on the in vitro activity of lumefantrine and atovaquone against Plasmodium falciparum","authors":["Kesinee Chotivanich","Mathirut Mungthin","Ronnatrai Ruengweerayuth","Rachanee Udomsangpetch","Arjen M Dondorp","Pratap Singhasivanon","Sasithon Pukrittayakamee","Nicholas J White"],"year":"2012","journal":"Malaria Journal","publisher":"Springer Nature","subject":"Parasitology","type":"journal-article","sha":"5516d3787bb7276419485eafaeacb1f90e7cb92f"}
+{"doi":"10.1186/1479-7364-4-1-43","title":"Evolutionary divergence and functions of the ADAM and ADAMTS gene families","authors":["Chad N Brocker","Vasilis Vasiliou","Daniel W Nebert"],"year":"2009","journal":"Human Genomics","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"c6052041b29ab547dc3a2d05f1a1202769b5ce9b"}
+{"doi":"10.1186/1755-7682-3-16","title":"Fasting hyperglycemia upon hospital admission is associated with higher pneumonia complication rates among the elderly","authors":["Mario R Castellanos","Anita Szerszen","Chadi Saifan","Irina Zigelboym","Georges Khoueiry","Nidal Abi Rafeh","Robert V Wetz","Morton Kleiner","Nelly Aoun","Kera F Weiserbs","Theodore Maniatis","Jeffrey Rothman"],"year":"2010","journal":"International Archives of Medicine","publisher":"Springer Nature","subject":"Medicine(all)","type":"journal-article","sha":"c204f24cd4e7d23778804b05cba2649a6bc253f3"}
+{"doi":"10.1186/1757-2215-3-14","title":"Increased androgen receptor expression in serous carcinoma of the ovary is associated with an improved survival","authors":["Björn Nodin","Nooreldin Zendehrokh","Jenny Brändstedt","Elise Nilsson","Jonas Manjer","Donal J Brennan","Karin Jirström"],"year":"2010","journal":"Journal of Ovarian Research","publisher":"Springer Nature","subject":"Obstetrics and Gynaecology","type":"journal-article","sha":"c72d0905fd87b615950533056c0600b66e84919d"}
+{"doi":"10.1186/1758-2652-14-60","title":"\"Othering\" the health worker: self-stigmatization of HIV/AIDS care among health workers in Swaziland","authors":["Daniel H de Vries","Shannon Galvin","Masitsela Mhlanga","Brian Cindzi","Thabsile Dlamini"],"year":"2011","journal":"Journal of the International AIDS Society","publisher":"Springer Nature","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"8d99a5f0b3a0f1d8f09ca6c18158d50fb5fc3fa3"}
+{"doi":"10.1186/cc14146","title":"Neutrophil to lymphocyte count ratio performs better than procalcitonin as a biomarker for bacteremia and severe sepsis in the emergency department","authors":["LL Ljungström","D Karlsson","A Pernestig","R Andersson","G Jacobsson"],"year":"2015","journal":"Critical Care","publisher":"Springer Nature","subject":"Critical Care and Intensive Care Medicine","type":"journal-article","sha":"70fd708f7d6dd7c4631d4c8405ae2d2c8662b9dc"}
+{"doi":"10.1186/s12864-015-1929-y","title":"Transfer RNA detection by small RNA deep sequencing and disease association with myelodysplastic syndromes","authors":["Yan Guo","Amma Bosompem","Sanjay Mohan","Begum Erdogan","Fei Ye","Kasey C. Vickers","Quanhu Sheng","Shilin Zhao","Chung-I Li","Pei-Fang Su","Madan Jagasia","Stephen A. Strickland","Elizabeth A. Griffiths","Annette S. Kim"],"year":"2015","journal":"BMC Genomics","publisher":"Springer Nature","subject":"Biotechnology","type":"journal-article","sha":"1ef9e53c100e5560f68a9b52ce248b4cb92f7f29"}
+{"doi":"10.1186/s12872-015-0156-4","title":"Exercise training restores the cardiac microRNA-1 and −214 levels regulating Ca2+ handling after myocardial infarction","authors":["Stéphano Freitas Soares Melo","Valério Garrone Barauna","Vander José Neves","Tiago Fernandes","Lucienne da Silva Lara","Diego Robles Mazzotti","Edilamar Menezes Oliveira"],"year":"2015","journal":"BMC Cardiovascular Disorders","publisher":"Springer Nature","subject":"Cardiology and Cardiovascular Medicine","type":"journal-article","sha":"283d6dffbaba150f637d391f43f91c9b29e47ed1"}
+{"doi":"10.1186/s12896-015-0192-2","title":"Identification and characterization of laccase-type multicopper oxidases involved in dye-decolorization by the fungus Leptosphaerulina sp.","authors":["Ledys S. Copete","Xiomara Chanagá","Jorge Barriuso","María F. López-Lucendo","María J. Martínez","Susana Camarero"],"year":"2015","journal":"BMC Biotechnology","publisher":"Springer Nature","subject":"Biotechnology","type":"journal-article","sha":"68ac5a3126ffa0815185431d54a67c28e7a73a6f"}
+{"doi":"10.1186/s12902-015-0061-y","title":"Role of enteral nutrition in nonthyroidal illness syndrome: a retrospective observational study","authors":["Ranran Li","Jianan Ren","Qin Wu","Gefei Wang","Xiuwen Wu","Jun Chen","Guanwei Li","Zhiwu Hong","Huajian Ren","Yunzhao Zhao","Jieshou Li"],"year":"2015","journal":"BMC Endocrine Disorders","publisher":"Springer Nature","subject":"Endocrinology, Diabetes and Metabolism","type":"journal-article","sha":"47b08697c19b66d56e29e95397dda1c086316be5"}
+{"doi":"10.1186/s12917-015-0470-1","title":"Prevalence and molecular heterogeneity of Bartonella bovis in cattle and Haemaphysalis bispinosa ticks in Peninsular Malaysia","authors":["Kai-Ling Kho","Fui-Xian Koh","Tariq Jaafar","Quaza Nizamuddin Hassan Nizam","Sun-Tee Tay"],"year":"2015","journal":"BMC Veterinary Research","publisher":"Springer Nature","subject":"veterinary(all)","type":"journal-article","sha":"6f0ace0df1d70ef73d193ad4c450247d684389af"}
+{"doi":"10.1186/s12917-015-0475-9","title":"Comparison of effectiveness of cefovecin, doxycycline, and amoxicillin for the treatment of experimentally induced early Lyme borreliosis in dogs","authors":["Bettina Wagner","John Johnson","David Garcia-Tapia","Nicole Honsberger","Vickie King","Catherine Strietzel","John M. Hardham","Thomas J. Heinz","Richard T. Marconi","Patrick F. M. Meeus"],"year":"2015","journal":"BMC Veterinary Research","publisher":"Springer Nature","subject":"veterinary(all)","type":"journal-article","sha":"df9ac6070e1c832dc52f872ed9c71c6c7885c812"}
+{"doi":"10.1186/s12929-015-0201-8","title":"Reactive oxygen species contribute to dysfunction of bone marrow hematopoietic stem cells in aged C57BL/6 J mice","authors":["Marcella L. Porto","Bianca P. Rodrigues","Thiago N. Menezes","Sara L. Ceschim","Dulce E. Casarini","Agata L. Gava","Thiago Melo C. Pereira","Elisardo C. Vasquez","Bianca P. Campagnaro","Silvana S. Meyrelles"],"year":"2015","journal":"Journal of Biomedical Science","publisher":"Springer Nature","subject":"Clinical Biochemistry","type":"journal-article","sha":"21cf2cb1d067a39f682c3fa4a97f04f7a9c83724"}
+{"doi":"10.1186/s12936-015-0901-2","title":"Evaluation of case management of uncomplicated malaria in Haiti: a national health facility survey, 2012","authors":["Keren Z. Landman","Samuel E. Jean","Alexandre Existe","Eniko E. Akom","Michelle A. Chang","Jean Frantz Lemoine","Kimberly E. Mace"],"year":"2015","journal":"Malaria Journal","publisher":"Springer Nature","subject":"Parasitology","type":"journal-article","sha":"e5f60ba6da2c039894bb44fe407f0a95c7ca02d3"}
+{"doi":"10.1186/s12955-015-0392-4","title":"Sense of coherence as a mediator of health-related quality of life dimensions in patients with breast cancer: a longitudinal study with prospective design","authors":["Camelia Rohani","Heidar-Ali Abedi","Kay Sundberg","Ann Langius-Eklöf"],"year":"2015","journal":"Health and Quality of Life Outcomes","publisher":"Springer Nature","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"ae0fbfda838057082daa881f607dea76e8101b11"}
+{"doi":"10.1186/s12984-015-0105-6","title":"Specific effects of EEG based neurofeedback training on memory functions in post-stroke victims","authors":["Silvia Erika Kober","Daniela Schweiger","Matthias Witte","Johanna Louise Reichert","Peter Grieshofer","Christa Neuper","Guilherme Wood"],"year":"2015","journal":"Journal of NeuroEngineering and Rehabilitation","publisher":"Springer Nature","subject":"Rehabilitation","type":"journal-article","sha":"ff911fa4d129aa7b6a8ee3b6e285cc53ada12382"}
+{"doi":"10.1186/s13041-015-0108-z","title":"The lateral parabrachial nucleus is actively involved in the acquisition of fear memory in mice","authors":["Masaru Sato","Mariko Ito","Masashi Nagase","Yae K Sugimura","Yukari Takahashi","Ayako M Watabe","Fusao Kato"],"year":"2015","journal":"Molecular Brain","publisher":"Springer Nature","subject":"Molecular Biology","type":"journal-article","sha":"1711f9d40f38bf887674e16752c66d62b5909ee9"}
+{"doi":"10.1186/s13058-015-0549-4","title":"Mesenchymal stem cells mediate the clinical phenotype of inflammatory breast cancer in a preclinical model","authors":["Lara Lacerda","Bisrat G Debeb","Daniel Smith","Richard Larson","Travis Solley","Wei Xu","Savitri Krishnamurthy","Yun Gong","Lawrence B Levy","Thomas Buchholz","Naoto T Ueno","Ann Klopp","Wendy A Woodward"],"year":"2015","journal":"Breast Cancer Research","publisher":"Springer Nature","subject":"Medicine(all)","type":"journal-article","sha":"e33da9ec056ea4a6dd845d25f378ac24241fd062"}
+{"doi":"10.1186/s13068-015-0352-6","title":"Bio-butanol production from glycerol with Clostridium pasteurianum CH4: the effects of butyrate addition and in situ butanol removal via membrane distillation","authors":["De-Shun Lin","Hong-Wei Yen","Wei-Chen Kao","Chieh-Lun Cheng","Wen-Ming Chen","Chieh-Chen Huang","Jo-Shu Chang"],"year":"2015","journal":"Biotechnology for Biofuels","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"af57a39fb1edcb774757d4abaadb6fe2e5c2e966"}
+{"doi":"10.1186/s13072-015-0017-5","title":"lobChIP: from cells to sequencing ready ChIP libraries in a single day","authors":["Ola Wallerman","Helena Nord","Madhusudhan Bysani","Lisa Borghini","Claes Wadelius"],"year":"2015","journal":"Epigenetics & Chromatin","publisher":"Springer Nature","subject":"Genetics","type":"journal-article","sha":"54072a34e47141cc8e3efddfa0777165eb6ca155"}
+{"doi":"10.1186/s13568-014-0026-y","title":"Identification and characterization of endophytic bacteria from corn (Zea mays L.) roots with biotechnological potential in agriculture","authors":["Vivian Jaskiw Szilagyi-Zecchin","Angela Cristina Ikeda","Mariangela Hungria","Douglas Adamoski","Vanessa Kava-Cordeiro","Chirlei Glienke","Lygia Vitória Galli-Terasawa"],"year":"2014","journal":"AMB Express","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"780b31dc293e8e522dfae0be7c4b57e7bfb3cd3d"}
+{"doi":"10.1186/s13661-015-0316-6","title":"The inverse scattering problem of some Schrödinger type equation with turning point","authors":["Zaki FA El-Raheem","Farouk A Salama"],"year":"2015","journal":"Boundary Value Problems","publisher":"Springer Nature","subject":"Algebra and Number Theory","type":"journal-article","sha":"21174e246c8a174bf15328cd9437344f2605c8ac"}
+{"doi":"10.1186/s40064-015-1093-5","title":"Effects of far-infrared sauna bathing on recovery from strength and endurance training sessions in men","authors":["Antti Mero","Jaakko Tornberg","Mari Mäntykoski","Risto Puurtinen"],"year":"2015","journal":"SpringerPlus","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"ebfa5e1a70dcbd5138642516987a8122eafdc966"}
+{"doi":"10.1186/s40478-015-0212-4","title":"Wild type human TDP-43 potentiates ALS-linked mutant TDP-43 driven progressive motor and cortical neuron degeneration with pathological features of ALS","authors":["Jacqueline C Mitchell","Remy Constable","Eva So","Caroline Vance","Emma Scotter","Leanne Glover","Tibor Hortobagyi","Eveline S. Arnold","Shuo-Chien Ling","Melissa McAlonis","Sandrine Da Cruz","Magda Polymenidou","Lino Tessarolo","Don W Cleveland","Christopher E Shaw"],"year":"2015","journal":"Acta Neuropathologica Communications","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"e129d9b54e99fa656095b587fa17cf677249d6b2"}
+{"doi":"10.1186/s40478-016-0302-y","title":"Characteristics of gliomas in patients with somatic IDH mosaicism","authors":["Charlotte Bonnet","Laure Thomas","Dimitri Psimaras","Franck Bielle","Elodie Vauléon","Hugues Loiseau","Stéphanie Cartalat-Carel","David Meyronet","Caroline Dehais","Jérôme Honnorat","Marc Sanson","François Ducray"],"year":"2016","journal":"Acta Neuropathologica Communications","publisher":"Springer Nature","subject":"","type":"journal-article","sha":"71d6c224b2abbf74ba03f439f946323ebaa3199b"}
+{"doi":"10.1186/s40693-015-0041-7","title":"Spatial distribution pattern of Mytilus chilensis beds in the Reloncaví fjord: hypothesis on associated processes","authors":["Carlos Alberto Molinet Flores","Manuel Alejandro Díaz Gomez","Camilo Bernardo Arriagada Muñoz","Leny Eunise Cares Pérez","Sandra Lorena Marín Arribas","Marcela Patricia Astorga Opazo","Edwin Juan Elías Niklitschek Huaquin"],"year":"2015","journal":"Revista Chilena de Historia Natural","publisher":"Springer Nature","subject":"Agricultural and Biological Sciences(all)","type":"journal-article","sha":"1d7d96cd4225193a43a38f9523b0948fe11e2f81"}
+{"doi":"10.1190/geo2013-0110.1","title":"Seismic ray tracing in anisotropic media: A modified Newton algorithm for solving highly nonlinear systems","authors":["Yanghua Wang"],"year":"2014","journal":"GEOPHYSICS","publisher":"Society of Exploration Geophysicists","subject":"Geochemistry and Petrology","type":"journal-article","sha":"bbc1ff65d8d74296bcd8d4f2984a9ff42b9a031e"}
+{"doi":"10.1198/004017002188618716","title":"Principal Components Regression With Data Chosen Components and Related Methods","authors":["J. T. Gene Hwang","Dan Nettleton"],"year":"2003","journal":"Technometrics","publisher":"Informa UK Limited","subject":"Modelling and Simulation","type":"journal-article","sha":"bb63ab9110158eb8ae24522a226615d878056776"}
+{"doi":"10.1214/105051607000000221","title":"Survival and complete convergence for a spatial branching system with local regulation","authors":["Matthias Birkner","Andrej Depperschmidt"],"year":"2007","journal":"The Annals of Applied Probability","publisher":"Institute of Mathematical Statistics","subject":"Statistics, Probability and Uncertainty","type":"journal-article","sha":"73065f6455d1d50d74d278397e0b5554469209b1"}
+{"doi":"10.1242/dev.027383","title":"Mechanisms and variation in plant development: sorting the wood from the trees in Vermont","authors":["G. Coupland"],"year":"null","journal":"Development","publisher":"The Company of Biologists","subject":"Developmental Biology","type":"journal-article","sha":"2cdc41c9aa3388221ad0189450d24e379fa33ab0"}
+{"doi":"10.1242/jcs.00068","title":"How to survive in a cruel world: Plant Signal Transduction","authors":["M. A. Blazquez"],"year":"2002","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"3ebf650663187b291c2033b2abb824f9adcea9f0"}
+{"doi":"10.1242/jcs.00153","title":"Caspase-dependent initiation of apoptosis and necrosis by the Fas receptor in lymphoid cells: onset of necrosis is associated with delayed ceramide increase","authors":["C. A. Hetz"],"year":"2002","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"9acee83f483196c813c5b9ec2e153c0e3e4d7b0b"}
+{"doi":"10.1242/jcs.02320","title":"Cytosolic tail sequences and subunit interactions are critical for synaptic localization of glutamate receptors","authors":["H. C.-H. Chang"],"year":"2005","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"b16a745154dc249c421345eebc329f82d4168af8"}
+{"doi":"10.1242/jcs.028241","title":"The class I bHLH factors E2-2A and E2-2B regulate EMT","authors":["V. R. Sobrado","G. Moreno-Bueno","E. Cubillo","L. J. Holt","M. A. Nieto","F. Portillo","A. Cano"],"year":"2009","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"45ac32bf7a6ad8f03f3a5a63d0df3a27dca981fc"}
+{"doi":"10.1242/jcs.043042","title":"Recruitment of vimentin to the cell surface by  3 integrin and plectin mediates adhesion strength","authors":["R. Bhattacharya","A. M. Gonzalez","P. J. DeBiase","H. E. Trejo","R. D. Goldman","F. W. Flitney","J. C. R. Jones"],"year":"2009","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"79458814708fddba9212b75bdbb6cf7e4e0780ac"}
+{"doi":"10.1242/jcs.104018","title":"Filamin A controls matrix metalloproteinase activity and regulates cell invasion in human fibrosarcoma cells","authors":["M. Baldassarre","Z. Razinia","N. N. Brahme","R. Buccione","D. A. Calderwood"],"year":"2012","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"362c2c71c7cbb740afee65c82451241336cb727d"}
+{"doi":"10.1242/jcs.114082","title":"Cooperativity between calmodulin-binding sites in Kv7.2 channels","authors":["Alessandro Alaimo","Araitz Alberdi","Carolina Gomis-Perez","Juncal Fernández-Orth","Juan Camilo Gómez-Posada","Pilar Areso","Alvaro Villarroel"],"year":"2013","journal":"Journal of Cell Science","publisher":"The Company of Biologists","subject":"Cell Biology","type":"journal-article","sha":"5e9d2fe5c249597e962c4ed5403e7083ca5d3658"}
+{"doi":"10.1242/jeb.126458","title":"Aeroelastic flutter of feathers, flight and the evolution of non-vocal communication in birds","authors":["C. J. Clark","R. O. Prum"],"year":"2015","journal":"Journal of Experimental Biology","publisher":"The Company of Biologists","subject":"Insect Science","type":"journal-article","sha":"4ea1915c88c66d971f2e525a37a780e54930af18"}
+{"doi":"10.1248/bpb.b12-00543","title":"Anti-diabetic Action of 7-O-Galloyl-d-sedoheptulose, a Polyphenol from Corni Fructus, through Ameliorating Inflammation and Inflammation-Related Oxidative Stress in the Pancreas of Type 2 Diabetics","authors":["Chan Hum Park","Takashi Tanaka","Takako Yokozawa"],"year":"2013","journal":"Biological and Pharmaceutical Bulletin","publisher":"Pharmaceutical Society of Japan","subject":"Pharmacology","type":"journal-article","sha":"892b7a96ea8da0649dc1f0b302f6a9b9b2a59c0d"}
+{"doi":"10.1248/cpb.58.135","title":"Recycling and Catalytic Approaches for the Development of a Rare-Metal-Free Synthetic Method Using Hypervalent Iodine Reagent","authors":["Toshifumi Dohi"],"year":"2010","journal":"CHEMICAL & PHARMACEUTICAL BULLETIN","publisher":"Pharmaceutical Society of Japan","subject":"Chemistry(all)","type":"journal-article","sha":"113edd22f6e661bbdd3ca07752db9f086460d975"}
+{"doi":"10.1257/aer.102.1.131","title":"The Environment and Directed Technical Change","authors":["Daron Acemoglu","Philippe Aghion","Leonardo Bursztyn","David Hemous"],"year":"2012","journal":"American Economic Review","publisher":"American Economic Association","subject":"Economics and Econometrics","type":"journal-article","sha":"08ab85041fe4023ff5fbbec7bbdc48c8d9ca4b11"}
+{"doi":"10.1258/ult.2012.011055","title":"Effect of patient positioning on the duration of venous reflux in duplex ultrasound for venous insufficiency","authors":["M Bonfield","F Cramp","T Robinson"],"year":"2012","journal":"Ultrasound","publisher":"SAGE Publications","subject":"Radiological and Ultrasound Technology","type":"journal-article","sha":"a1fb3878079ccb07e6b5ce8503e5dbe12fa6fe97"}
+{"doi":"10.1266/ggs.79.345","title":"Effect of single alien chromosome from shallot (Allium cepa L. Aggregatum group) on carbohydrate production in leaf blade of bunching onion (A. fistulosum L.)","authors":["Tran Thi Minh Hang","Masayoshi Shigyo","Shigenori Yaguchi","Naoki Yamauchi","Yosuke Tashiro"],"year":"2004","journal":"Genes & Genetic Systems","publisher":"Genetics Society of Japan","subject":"Genetics","type":"journal-article","sha":"e9a1de7171a538437bc06a2c6c67f2eb1c1ff84d"}
+{"doi":"10.1267/ahc.13005","title":"Involvement of Leptin in the Progression of Experimentally Induced Peritoneal Fibrosis in Mice","authors":["Masayuki Nakazawa","Yoko Obata","Tomoya Nishino","Shinichi Abe","Yuka Nakazawa","Katsushige Abe","Akira Furusu","Masanobu Miyazaki","Takehiko Koji","Shigeru Kohno"],"year":"2013","journal":"ACTA HISTOCHEMICA ET CYTOCHEMICA","publisher":"Japan Society of Histochemistry & Cytochemistry","subject":"Pathology and Forensic Medicine","type":"journal-article","sha":"c3bac0531492c19b1d861ead939f3d9807a8f0a4"}
+{"doi":"10.12691/ajeee-1-2-1","title":"A High Resolution First Order Noise-Shaping Vernier Time-to-Digital Converter","authors":["Majid Memarian Sorkhabi","Siroos Toofan"],"year":"2013","journal":"American Journal of Electrical and Electronic Engineering","publisher":"Science and Education Publishing Co., Ltd.","subject":"","type":"journal-article","sha":"99ccdbabfaafa1f0384f7b9ada265195a1fa53a8"}
+{"doi":"10.12785/amis/070506","title":"Approximate Completed Trace Equivalence of Three Dimensional t-Model Nonlinear Algebraic Hybrid Systems","authors":["Hao Yang","Jinzhao Wu","Zhiwei Zhang","Yang Liu"],"year":"2013","journal":"Applied Mathematics & Information Sciences","publisher":"Scientific Publishing Center","subject":"","type":"journal-article","sha":"f1755779197ed17483f776a3a9627638192d0123"}
+{"doi":"10.1287/educ.1053.0017","title":"Branch and Tree Decomposition Techniques for Discrete Optimization","authors":["Illya V. Hicks","Arie M. C. A. Koster","Elif KolotoÄŸlu"],"year":"2005","journal":"Emerging Theory, Methods, and Applications","publisher":"Institute for Operations Research and the Management Sciences (INFORMS)","subject":"","type":"book-chapter","sha":"5424fdf309d72ee1824291243bf4115db6ea5b13"}
+{"doi":"10.1287/mnsc.1040.0317","title":"An Empirical Analysis of Forecast Sharing in the Semiconductor Equipment Supply Chain","authors":["Christian Terwiesch","Z. Justin Ren","Teck H. Ho","Morris A. Cohen"],"year":"2005","journal":"Management Science","publisher":"Institute for Operations Research and the Management Sciences (INFORMS)","subject":"Management Science and Operations Research","type":"journal-article","sha":"b4ffc1b64f875211656c8211fa3bd409182d06be"}
+{"doi":"10.1287/mnsc.2015.2211","title":"Career Prospects and Effort Incentives: Evidence from Professional Soccer","authors":["Jeanine Miklós-Thal","Hannes Ullrich"],"year":"2016","journal":"Management Science","publisher":"Institute for Operations Research and the Management Sciences (INFORMS)","subject":"Management Science and Operations Research","type":"journal-article","sha":"2f4c1043920382b288e59cd88a3357bf0432fb01"}
+{"doi":"10.1289/ehp.1306945","title":"Health Risks from Lead-Based Ammunition in the Environment","authors":["David C. Bellinger","Joanna Burger","Tom J. Cade","Deborah A. Cory-Slechta","Myra Finkelstein","Howard Hu","Michael Kosnett","Philip J. Landrigan","Bruce Lanphear","Mark A. Pokras","Patrick T. Redig","Bruce A. Rideout","Ellen Silbergeld","Robert Wright","Donald R. Smith"],"year":"null","journal":"Environmental Health Perspectives","publisher":"Environmental Health Perspectives","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"549effe64f6babe6155455d08b8c1d56e96c70ef"}
+{"doi":"10.1289/ehp.1307918","title":"Neurobehavioral Function in School-Age Children Exposed to Manganese in Drinking Water","authors":["Youssef Oulhote","Donna Mergler","Benoit Barbeau","David C. Bellinger","Thérèse Bouffard","Marie-Ève Brodeur","Dave Saint-Amour","Melissa Legrand","Sébastien Sauvé","Maryse F. Bouchard"],"year":"null","journal":"Environmental Health Perspectives","publisher":"Environmental Health Perspectives","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"87c3f9e7bfe5800202f956f7c7ce34705e5d112d"}
+{"doi":"10.1299/jmmp.1.779","title":"Fracture Behavior of Micro-Sized Fe-3%Si Alloy Single Crystals","authors":["Eiji TAKI","Yuji KAWAKAMI","Masaaki OTSU","Kazuki TAKASHIMA"],"year":"2007","journal":"Journal of Solid Mechanics and Materials Engineering","publisher":"Japan Society of Mechanical Engineers","subject":"","type":"journal-article","sha":"7987b71855654ab32c7d53420111b0cbff8bd8e4"}
+{"doi":"10.1348/000711009x456935","title":"Testing and modelling non-normality within the one-factor model","authors":["Dylan Molenaar","Conor V. Dolan","Norman D. Verhelst"],"year":"2010","journal":"British Journal of Mathematical and Statistical Psychology","publisher":"Wiley-Blackwell","subject":"Statistics and Probability","type":"journal-article","sha":"1ceb3a7565ee6676d0ca7b459f851f650c7a243f"}
+{"doi":"10.1364/ao.42.003583","title":"Versatile mobile lidar system for environmental monitoring","authors":["Petter Weibring","Hans Edner","Sune Svanberg"],"year":"2003","journal":"Applied Optics","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"b9f481f1084cb7b31a5fd3127a3c76c1e22003ac"}
+{"doi":"10.1364/ao.42.005130","title":"Mirrors with regular hexagonal segments","authors":["Dario Amodei","Stephen Padin"],"year":"2003","journal":"Applied Optics","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"19d5d3272074f3d49879cd7c97c28dcc633b3b76"}
+{"doi":"10.1364/jocn.4.000449","title":"Design of Energy-Saving Algorithms for Hybrid Fiber Coaxial Networks Based on the DOCSIS 30 Standard","authors":["Zuqing Zhu"],"year":"2012","journal":"Journal of Optical Communications and Networking","publisher":"The Optical Society","subject":"Computer Networks and Communications","type":"journal-article","sha":"acac3aae8cba2743b2dc6e1708a2a98daa96eea3"}
+{"doi":"10.1364/josaa.19.000158","title":"Structural modeling of contrast sensitivity in adulthood","authors":["Charles T. Scialfa","Donald W. Kline","Philip K. Wood"],"year":"2002","journal":"Journal of the Optical Society of America A","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"a75a17f2539deb121f554dcc5bddea499451ae05"}
+{"doi":"10.1364/josab.15.001476","title":"Spatial walking solitons in quadratic nonlinear crystals","authors":["Lluis Torner","Dumitru Mihalache","Dumitru Mazilu","Maria C. Santos","Nail N. Akhmediev"],"year":"1998","journal":"Journal of the Optical Society of America B","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"e0864d996ae0acf38717a96312f8c6b4ad01f7a7"}
+{"doi":"10.1364/josab.24.002821","title":"Effects of mode degeneracy in the LIGO Livingston Observatory recycling cavity","authors":["Andri M. Gretarsson","Erika D'Ambrosio","Valery Frolov","Brian O'Reilly","Peter K. Fritschel"],"year":"2007","journal":"Journal of the Optical Society of America B","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"214447a16cf3afab71f39f2b0576f3e121fe0573"}
+{"doi":"10.1364/josab.26.001688","title":"Direct observation of negative refraction at the millimeter-wave regime by using a flat composite metamaterial","authors":["Kamil Boratay Alici","Ekmel Ozbay"],"year":"2009","journal":"Journal of the Optical Society of America B","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"a3608f1c7182c52a1850cbed8f567c582dfdd9c0"}
+{"doi":"10.1364/oe.19.024569","title":"Electrically pumped silicon waveguide light sources","authors":["Hasitha Jayatilleka","Arsam Nasrollahy-Shiraz","Anthony J. Kenyon"],"year":"2011","journal":"Optics Express","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"0468b39bf6361b07c0fb3a02b06751e2b3a73636"}
+{"doi":"10.1364/ol.21.001756","title":"Pulse shaping of incoherent light by use of a liquid-crystal modulator array","authors":["V. Binjrajka","C.-C. Chang","A. W. R. Emanuel","D. E. Leaird","A. M. Weiner"],"year":"1996","journal":"Optics Letters","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"beb8750fbdcba603a9362745c62fdd1ede63051f"}
+{"doi":"10.1364/ol.25.001141","title":"Fiber Bragg grating sensor for simultaneous measurement of displacement and temperature","authors":["Youlong Yu","Hwayaw Tam","Wenghong Chung","Muhtesem Suleyman Demokan"],"year":"2000","journal":"Optics Letters","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"829a0774aa383b4f2465857214edd810c498fc26"}
+{"doi":"10.1364/ol.26.001612","title":"Single-shot spectral interferometry with chirped pulses","authors":["J.-P. Geindre","P. Audebert","S. Rebibo","J.-C. Gauthier"],"year":"2001","journal":"Optics Letters","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"6a3e7631623942c5ec0b967099d500164746f9fd"}
+{"doi":"10.1364/ol.28.000227","title":"Statistics of polarization-dependent gain in fiber-based Raman amplifiers","authors":["Q. Lin","Govind P. Agrawal"],"year":"2003","journal":"Optics Letters","publisher":"The Optical Society","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"d9cc640d9b58a0588039a42d6e5dc634bbab4d72"}
+{"doi":"10.1371/journal.pcbi.0020153","title":"Structural Modeling of Protein Interactions by Analogy: Application to PSD-95","authors":["Dmitry Korkin","Fred P. Davis","Frank Alber","Tinh Luong","Min-Yi Shen","Vladan Lucic","Mary B. Kennedy","Andrej Sali"],"year":"2006","journal":"PLoS Computational Biology","publisher":"Public Library of Science (PLoS)","subject":"Ecology","type":"journal-article","sha":"87e576472d0676dd2e3140ae077cba3f80f6c96f"}
+{"doi":"10.1373/clinchem.2003.023135","title":"Stability of Plasma Free Metanephrines during Collection and Storage as Assessed by an Optimized HPLC Method with Electrochemical Detection","authors":["J. J. Willemsen"],"year":"2003","journal":"Clinical Chemistry","publisher":"American Association for Clinical Chemistry (AACC)","subject":"Clinical Biochemistry","type":"journal-article","sha":"729580d0ef1dd3831a8c513ce83d97a232ba2abe"}
+{"doi":"10.1373/clinchem.2005.053181","title":"Integrated Microfluidic Compact Disc Device with Potential Use in Both Centralized and Point-of-Care Laboratory Settings","authors":["M. Inganas"],"year":"2005","journal":"Clinical Chemistry","publisher":"American Association for Clinical Chemistry (AACC)","subject":"Clinical Biochemistry","type":"journal-article","sha":"59e82bea1a8c5b58cd7521e22c7d91b4fc5ada78"}
+{"doi":"10.1373/clinchem.2008.112102","title":"European External Quality Control Study on the Competence of Laboratories to Recognize Rare Sequence Variants Resulting in Unusual Genotyping Results","authors":["J. Marki-Zay","C. L. Klein","D. Gancberg","H. G. Schimmel","L. Dux"],"year":"2009","journal":"Clinical Chemistry","publisher":"American Association for Clinical Chemistry (AACC)","subject":"Clinical Biochemistry","type":"journal-article","sha":"ccedb90e95bc0b5b4de91236f6c064f82af726b7"}
+{"doi":"10.1373/clinchem.2011.172726","title":"Is the WHO 90:10 Prostate-Specific Antigen (PSA) First International Reference Standard Really 90%  1-Antichymotrypsin-Bound PSA and 10% Free PSA?","authors":["L. J. Sokoll","S. Rosenwald","J. Lyons","D. J. Elliott","D. W. Chan"],"year":"2011","journal":"Clinical Chemistry","publisher":"American Association for Clinical Chemistry (AACC)","subject":"Clinical Biochemistry","type":"journal-article","sha":"eae9e94e9b77cedb8b592fc878477ba5de2c8fdf"}
+{"doi":"10.1373/clinchem.2011.173385","title":"In Search for a Better Marker of Acute Pancreatitis--Third Time Lucky?","authors":["A. Viljoen","J. T. Patrick"],"year":"2011","journal":"Clinical Chemistry","publisher":"American Association for Clinical Chemistry (AACC)","subject":"Clinical Biochemistry","type":"journal-article","sha":"d37edd4b819188ce01e6dfc92fa9442e02658660"}
+{"doi":"10.1378/chest.82.6.678","title":"Patients with Myocardial Infarction and Normal Coronary Arteriogram","authors":["V. Legrand","M. Deliege","L. Henrard","J. Boland","H. Kulbertus"],"year":"1982","journal":"Chest","publisher":"Elsevier BV","subject":"Critical Care and Intensive Care Medicine","type":"journal-article","sha":"dd9f62c56cec719e8c45968f59f75ad8390e3e99"}
+{"doi":"10.1385/criai:20:3:271","title":"Status Asthmaticus: From the Emergency Department to the Intensive Care Unit","authors":["Nicholas Kenyon","Timothy E Albertson"],"year":"null","journal":"Clinical Reviews in Allergy & Immunology","publisher":"Springer Nature","subject":"Immunology and Allergy","type":"journal-article","sha":"6ff83beffaa58baf8e44b5d465b3f6ea14086192"}
+{"doi":"10.1386/pjss.4.1.3/1","title":"The origins of the welfare state in Portugal: the new frontiers between public and private*","authors":["Miriam Halpern Pereira"],"year":"2005","journal":"Portugese Journal of Social Sciences","publisher":"Intellect","subject":"","type":"journal-article","sha":"2f1f1d6a8b0600da9d27f3d557824716f0e2d97c"}
+{"doi":"10.14321/realanalexch.35.2.0343","title":"Continuous Rigid Functions","authors":[" Christian Richter"],"year":"2010","journal":"Real Analysis Exchange","publisher":"Michigan State University Press","subject":"","type":"journal-article","sha":"f047c5a1e547e517d9ea4088d2bdb730087dd1d6"}
+{"doi":"10.14778/2733004.2733039","title":"Ontology assisted crowd mining","authors":["Yael Amsterdamer","Susan B. Davidson","Tova Milo","Slava Novgorodov","Amit Somech"],"year":"2014","journal":"Proceedings of the VLDB Endowment","publisher":"VLDB Endowment","subject":"","type":"journal-article","sha":"e65d4eac06f9d74154db0713ed48a7f64df89ab4"}
+{"doi":"10.1504/ijarge.2005.007456","title":"Including non-trade concerns: the environment in EU and US agricultural policy","authors":["Kathy Baylis","Gordon C. Rausser","Leo K. Simon"],"year":"2005","journal":"International Journal of Agricultural Resources, Governance and Ecology","publisher":"Inderscience Publishers","subject":"Agronomy and Crop Science","type":"journal-article","sha":"c0dd1d6d5d11faffe243bdb6a729b6ea7629ba59"}
+{"doi":"10.15171/ijhpm.2015.07","title":"Shanghai rising: health improvements as measured by avoidable mortality since 2000","authors":["Michael K. Gusmano","Victor G. Rodwin","Chunfang Wang","Daniel Weisz","Li Luo","Fu Hua"],"year":"null","journal":"International Journal of Health Policy and Management","publisher":"International Society for Phytocosmetic Sciences","subject":"","type":"journal-article","sha":"b0a3aaa36bb1b1b7f33122d448966f71a3737021"}
+{"doi":"10.1523/jneurosci.1039-10.2010","title":"Maternal Care and DNA Methylation of a Glutamic Acid Decarboxylase 1 Promoter in Rat Hippocampus","authors":["T.-Y. Zhang","I. C. Hellstrom","R. C. Bagot","X. Wen","J. Diorio","M. J. Meaney"],"year":"2010","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"e9e863fdc9d6a3d4563649bdd782e7c25da12337"}
+{"doi":"10.1523/jneurosci.1309-09.2009","title":"A Janus-Like Role of CREB Protein: Enhancement of Synaptic Property in Mature Neurons and Suppression of Synaptogenesis and Reduced Network Synchrony in Early Development","authors":["M. Nonaka"],"year":"2009","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"c80e94a911e2f77cdce8a9f26adfe9e98e70e837"}
+{"doi":"10.1523/jneurosci.1527-13.2013","title":"Number of Spikes in Climbing Fibers Determines the Direction of Cerebellar Learning","authors":["A. Rasmussen","D.-A. Jirenhed","R. Zucca","F. Johansson","P. Svensson","G. Hesslow"],"year":"2013","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"15cedc19e97c839a3b6ce923a3b88ac6a6f5352d"}
+{"doi":"10.1523/jneurosci.2349-11.2011","title":"ERK2 Contributes to the Control of Social Behaviors in Mice","authors":["Y. Satoh","S. Endo","T. Nakata","Y. Kobayashi","K. Yamada","T. Ikeda","A. Takeuchi","T. Hiramoto","Y. Watanabe","T. Kazama"],"year":"2011","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"7a73195aa60c338ed051ab3623c82709ca38e9de"}
+{"doi":"10.1523/jneurosci.3140-11.2011","title":"The Relationship between Brain Oscillations and BOLD Signal during Memory Formation: A Combined EEG-fMRI Study","authors":["S. Hanslmayr","G. Volberg","M. Wimber","M. Raabe","M. W. Greenlee","K.-H. T. Bauml"],"year":"2011","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"5719ade214d6548302cd50900195943d22dd50fa"}
+{"doi":"10.1523/jneurosci.4036-14.2015","title":"Microglia Disrupt Mesolimbic Reward Circuitry in Chronic Pain","authors":["A. M. W. Taylor","A. Castonguay","A. J. Taylor","N. P. Murphy","A. Ghogha","C. Cook","L. Xue","M. C. Olmstead","Y. De Koninck","C. J. Evans","C. M. Cahill"],"year":"2015","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"711538df4f98f131626c44bd6f3383a2376d9199"}
+{"doi":"10.1523/jneurosci.4846-11.2012","title":"Bone Marrow Transplantation Confers Modest Benefits in Mouse Models of Huntington's Disease","authors":["W. Kwan","A. Magnusson","A. Chou","A. Adame","M. J. Carson","S. Kohsaka","E. Masliah","T. Moller","R. Ransohoff","S. J. Tabrizi","M. Bjorkqvist","P. J. Muchowski"],"year":"2012","journal":"Journal of Neuroscience","publisher":"Society for Neuroscience","subject":"Neuroscience(all)","type":"journal-article","sha":"c2ffbb4ec03c60a144d06736917fb9e251695ab5"}
+{"doi":"10.1542/peds.2009-2175","title":"Accuracy of MUAC in the Detection of Severe Wasting With the New WHO Growth Standards","authors":["M. A. L. Fernandez","P. Delchevalerie","M. van Herp"],"year":"2010","journal":"PEDIATRICS","publisher":"American Academy of Pediatrics (AAP)","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"fc5b3c87ee44687d4026b86db8f92f219c8163f5"}
+{"doi":"10.1542/peds.2014-3701","title":"Congenital Left Paraduodenal Hernia Causing Chronic Abdominal Pain and Abdominal Catastrophe","authors":["Y. Shi","A. E. Felsted","P. M. Masand","B. A. Mothner","J. G. Nuchtern","J. R. Rodriguez","S. A. Vasudevan"],"year":"2015","journal":"PEDIATRICS","publisher":"American Academy of Pediatrics (AAP)","subject":"Pediatrics, Perinatology, and Child Health","type":"journal-article","sha":"34958f9d0a47436f538ba2f263ff03626d551f04"}
+{"doi":"10.15517/rbt.v55i0.5806","title":"Population and reproductive biology of the crab Uca burgersi (Crustacea: Ocypodidae) in three subtropical mangrove forests","authors":["A.S. Benetti","M.L. Negreiros-Fransozo - T.M. Costa"],"year":"null","journal":"Revista de Biología Tropical","publisher":"Universidad de Costa Rica","subject":"Agricultural and Biological Sciences(all)","type":"journal-article","sha":"6ba0bce8b868f12f3cd31e9c3996dcff37348438"}
+{"doi":"10.1557/mrs2008.25","title":"Visualizing the Behavior of Dislocations—Seeing is Believing","authors":["Ian M. Robertson","Paulo J. Ferreira","Gerhard Dehm","Robert Hull","Eric A. Stach"],"year":"2008","journal":"MRS Bulletin","publisher":"Cambridge University Press (CUP)","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"354ef362d55b520bdc7e79b88b65b4a1ec92cb59"}
+{"doi":"10.1557/proc-605-135","title":"Selective W for Coating and Releasing MEMS Devices","authors":["S. S. Mani","J. G. Fleming","J. J. Sniegowski","M. P. de Boer","L. W. Irwin","J. A. Walraven","D. M. Tanner","D. A. La Van"],"year":"1999","journal":"MRS Proceedings","publisher":"Cambridge University Press (CUP)","subject":"","type":"journal-article","sha":"133b9438d862e462e5fa15993734e2359790ad25"}
+{"doi":"10.1590/0004-2730000003176","title":"Preproghrelin polymorphism Q90L (rs4684677) in gestational diabetes","authors":["Rafaela Andrade Rocha","Henrique Ravanhol Frigeri","Izabella Castilhos Ribeiro dos Santos-Weiss","Rosângela Roginski Réa","Emanuel Maltempi de Souza","Fabiane Gomes de Moraes Rego","Geraldo Picheth"],"year":"null","journal":"Arquivos Brasileiros de Endocrinologia & Metabologia","publisher":"FapUNIFESP (SciELO)","subject":"Endocrinology, Diabetes and Metabolism","type":"journal-article","sha":"17043b885f1981ce79cb3dcb8d2fd5004c9c14b8"}
+{"doi":"10.1590/0037-8682-0231-2013","title":"Diagnosing schistosomiasis: where are we?","authors":["Luciana Inácia Gomes","Martin Johannes Enk","Ana Rabello"],"year":"null","journal":"Revista da Sociedade Brasileira de Medicina Tropical","publisher":"FapUNIFESP (SciELO)","subject":"Microbiology (medical)","type":"journal-article","sha":"723e4fae1cf2518ac4d62db198d204174bec1075"}
+{"doi":"10.1590/s0004-27302008000300005","title":"Gasto energético corporal: conceitos, formas de avaliação e sua relação com a obesidade","authors":["Camila Maria de Melo","Julio Tirapegui","Sandra Maria Lima Ribeiro"],"year":"null","journal":"Arquivos Brasileiros de Endocrinologia & Metabologia","publisher":"FapUNIFESP (SciELO)","subject":"Endocrinology, Diabetes and Metabolism","type":"journal-article","sha":"d22015b357a39b49942c45ca817b281bd0380f01"}
+{"doi":"10.1590/s0004-27302009000600010","title":"Taxa metabólica de repouso e composição corporal em mulheres na pós-menopausa","authors":["Valéria Bonganha","Miguel Soares Conceição","Claudinei Ferreira dos Santos","Mara Patrícia Traína Chacon-Mikahil","Vera Aparecida Madruga"],"year":"null","journal":"Arquivos Brasileiros de Endocrinologia & Metabologia","publisher":"FapUNIFESP (SciELO)","subject":"Endocrinology, Diabetes and Metabolism","type":"journal-article","sha":"4ecc09d70c152bc695be800f7cc02ceaab038f51"}
+{"doi":"10.1590/s0004-27302013000200006","title":"Assessment of the elasticity properties of the ascending aorta in patients with subclinical hypothyroidism by tissue Doppler imaging","authors":["Mustafa Yurtdaş","Ramazan Gen","Turkay Özcan","Mehmet Kasım Aydın"],"year":"null","journal":"Arquivos Brasileiros de Endocrinologia & Metabologia","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"1b12efebe1b24f89953322a970c38f0ed380e8a8"}
+{"doi":"10.1590/s0004-282x2002000100003","title":"Preservation of the olfactory tract in bifrontal craniotomy","authors":["Paulo H. Aguiar","Guilherme A. Pulici","Leonardo O. Lourenco","Juan A.C. Flores","Valter A. Cescato"],"year":"null","journal":"Arquivos de Neuro-Psiquiatria","publisher":"FapUNIFESP (SciELO)","subject":"Biological Psychiatry","type":"journal-article","sha":"ede6e6f881251fabcd62fb2b42dff8c539812a5f"}
+{"doi":"10.1590/s0034-70942002000200011","title":"Hipercapnia acentuada durante circulação extracorpórea em cirurgia para revascularização do miocárdio: relato de caso","authors":["Maurício Serrano Nascimento","Cassiano Franco Bernardes","Roberta Louro de Medeiros"],"year":"null","journal":"Revista Brasileira de Anestesiologia","publisher":"Elsevier BV","subject":"Anesthesiology and Pain Medicine","type":"journal-article","sha":"c99e64426737d1c895436c5c456a74dcbc678cca"}
+{"doi":"10.1590/s0034-71081999000100009","title":"Porcellanid crabs (Crustacea, Decapoda) inhabiting sand reefs built by Phragmatopoma lapidosa (Polychaeta Sabellariidae) at Paranapuã beach , São Vicente, SP, Brazil","authors":["C.V. MICHELETTI-FLORES","M. L. NEGREIROS-FRANSOZO"],"year":"null","journal":"Revista Brasileira de Biologia","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"24899b7224b6ad9ca66fdce5471f30060eab5957"}
+{"doi":"10.1590/s0034-72992004000300010","title":"Avaliação do perfil auditivo de militares de um quartel do Exército Brasileiro","authors":["Ana P. Silva","Everardo A. da Costa","Salete M. M. Rodrigues","Humberto L. R. Souza","Valéria G. Massafera"],"year":"null","journal":"Revista Brasileira de Otorrinolaringologia","publisher":"FapUNIFESP (SciELO)","subject":"Otorhinolaryngology","type":"journal-article","sha":"1b5dccaded1bcc516c03cdd412d8d8cc62bc05d1"}
+{"doi":"10.1590/s0034-737x2012000500004","title":"Análises técnicas e econômicas no sistema de integração lavoura-pecuária submetido à adubação nitrogenada","authors":["Nídia Raquel Costa","Marcelo Andreotti","Máila Terra Gioia","Maria Aparecida Anselmo Tarsitano","Cristiano Magalhães Pariz","Salatiér Buzetti"],"year":"null","journal":"Revista Ceres","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"60341a791652d9bd9c2348416b472ebfb8e85e48"}
+{"doi":"10.1590/s0034-759020140607","title":"Antecedentes de las intenciones de abandono en cooperativas colombianas","authors":["Juan Pablo Román-Calderón","Adalgisa Battistelli","Mario Vargas-Saenz"],"year":"null","journal":"Revista de Administração de Empresas","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"485bca3c99c404801c31e1a9e0d0765773ecd7f3"}
+{"doi":"10.1590/s0034-77012000000200006","title":"Notas sobre a análise antropológica de setores do Estado brasileiro","authors":["Ciméa Bevilaqua","Piero de Camargo Leirner"],"year":"null","journal":"Revista de Antropologia","publisher":"FapUNIFESP (SciELO)","subject":"Social Sciences(all)","type":"journal-article","sha":"bcb9603cdb8d9412b1ffbb88e00e614839b5ce72"}
+{"doi":"10.1590/s0071-12761986000200005","title":"Qualidade fisiológica e comportamento de sementes de soja (Glycine max (L.) Merrill) no armazenamento e no campo","authors":["J. Marcos Filho","R. V. de Carvalho","S. M. Cicero","C. G. B. Demétrio"],"year":"null","journal":"Anais da Escola Superior de Agricultura Luiz de Queiroz","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"81c1a528f34d2e4793451a1e48e2a7c0366e57f3"}
+{"doi":"10.1590/s0074-02761917000100001","title":"Contribuições para o conhecimento da fauna helmintolojica brazileira","authors":["Lauro Travassos"],"year":"null","journal":"Memórias do Instituto Oswaldo Cruz","publisher":"FapUNIFESP (SciELO)","subject":"Microbiology (medical)","type":"journal-article","sha":"fde4538871c108f21408397eae572d1299004926"}
+{"doi":"10.1590/s0074-02761987000500007","title":"Cloning of genes for antigenically relevant proteins of Trypanosoma cruzi","authors":["Alberto C. C. Frasch","Jose L. Affranchino","Carlos Ibañez","Roberto E. Macina","Maria B. Reys","Mario E. Camargo","Lena Aslud","Ulf Petterson"],"year":"null","journal":"Memórias do Instituto Oswaldo Cruz","publisher":"FapUNIFESP (SciELO)","subject":"Microbiology (medical)","type":"journal-article","sha":"4c656f720abc3c0b3efee77c194fb16654c457b9"}
+{"doi":"10.1590/s0100-46702010000400011","title":"Determination of copper at wide range concentrations using instrumental features of high-resolution continuum source flame atomic absorption spectrometry","authors":["Renata Toledo Lima","Jorge Luiz Raposo Jr.","Alex Virgílio","José Anchieta Gomes Neto"],"year":"null","journal":"Eclética Química","publisher":"FapUNIFESP (SciELO)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"72cdaa5df897570a795033a04e365f8e4d69eb74"}
+{"doi":"10.1590/s0100-84042002000200012","title":"Estudo comparativo da fenologia de nove espécies arbóreas em três tipos de floresta atlântica no sudeste do Brasil","authors":["CINARA S.C. BENCKE","L. PATRÃCIA C. MORELLATO"],"year":"null","journal":"Revista Brasileira de Botânica","publisher":"FapUNIFESP (SciELO)","subject":"Plant Science","type":"journal-article","sha":"31fffb87d80647a955eb5357362826f76571c68f"}
+{"doi":"10.1590/s0100-84042003000200001","title":"Distribuição espacial da flora terrestre fanerogâmica do Parque Nacional Marinho de Abrolhos, BA","authors":["Alexandre Kemenes"],"year":"null","journal":"Revista Brasileira de Botânica","publisher":"FapUNIFESP (SciELO)","subject":"Plant Science","type":"journal-article","sha":"70f6bb41ce86bee069440cd5bd0b62555bc0112b"}
+{"doi":"10.1590/s0100-84042004000100015","title":"Anatomia e ultra-estrutura do pulvino primário de Pterodon pubescens Benth. (Fabaceae - Faboideae)","authors":["Sílvia R. Machado","Tatiane M. Rodrigues"],"year":"null","journal":"Revista Brasileira de Botânica","publisher":"FapUNIFESP (SciELO)","subject":"Plant Science","type":"journal-article","sha":"6d6d10587cc685f2f757c9fc55f43e7e36b9d5de"}
+{"doi":"10.1590/s0102-261x2007000500001","title":"Programa de Geologia e Geofísica Marinha do Brasil","authors":["Sidney L. M. Mello"],"year":"null","journal":"Revista Brasileira de Geofísica","publisher":"FapUNIFESP (SciELO)","subject":"Geophysics","type":"journal-article","sha":"67f3776f100a70bd3c39474053a6999f9f3df7ae"}
+{"doi":"10.1590/s0102-311x1994000800002","title":"Um modelo hierárquico de análise das variáveis sócio-econômicas e dos padrões de contatos com águas associados à forma hepatoesplênica da esquistossomose","authors":["Maria Fernanda Lima e Costa","Roberto S. Rocha","Maria Helena de A. Magalhães","Naftale Katz"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"33ed802d62c93c38888fe986e8eaca174740f343"}
+{"doi":"10.1590/s0102-311x1996000400007","title":"As implicações do conhecimento prático para a vigilância em saúde do trabalhador","authors":["Leny Sato"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"2d26fcdce614c53747f0a301aaef33eeb2ce9b34"}
+{"doi":"10.1590/s0102-311x2000000200009","title":"Ideologia e relações de gênero: um estudo de recepção das propagandas de prevenção da AIDS","authors":["Adriane Roso"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"ffd42084c06bdc643536bb6f53fe53e7b799df9e"}
+{"doi":"10.1590/s0102-311x2004000600018","title":"Fatores associados ao uso de medicamentos durante a gestação em seis cidades brasileiras","authors":["Sotero Serrate Mengue","Eloir Paulo Schenkel","Maria Inês Schmidt","Bruce B. Duncan"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"4ee5c296fa1e1e520e7706bbd463490efa093fb0"}
+{"doi":"10.1590/s0102-311x2007000900025","title":"Las enfermedades crónicas desde la mirada de los enfermos y los profesionales de la salud: un estudio cualitativo en México","authors":["Francisco J. Mercado-Martínez","Eduardo Hernández-Ibarra"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"7d5464c45ecff58ff1b29ddb7d32783ea3983ce3"}
+{"doi":"10.1590/s0102-311x2007001100020","title":"Saúde bucal no Programa Saúde da Família: uma avaliação do modelo assistencial","authors":["Tatyana Maria Silva de Souza","Angelo Giuseppe Roncalli"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"c7f583a38215a23db8d72f15bbc7c82659ebb89a"}
+{"doi":"10.1590/s0102-311x2013001200018","title":"Access to prenatal care: assessment of the adequacy of different indices","authors":["Edson Theodoro dos Santos Neto","Adauto Emmerich Oliveira","Eliana Zandonade","Maria do Carmo Leal"],"year":"null","journal":"Cadernos de Saúde Pública","publisher":"FapUNIFESP (SciELO)","subject":"Medicine(all)","type":"journal-article","sha":"b9381fa2aeeea9381ef0fae058951fb836f11aca"}
+{"doi":"10.1590/s0102-37722004000300005","title":"Resiliência e desenvolvimento infantil de crianças que cuidam de crianças: uma visão em perspectiva","authors":["Michele Poletto","Tânia Maria Cemin Wagner","Sílvia Helena Koller"],"year":"null","journal":"Psicologia: Teoria e Pesquisa","publisher":"FapUNIFESP (SciELO)","subject":"Psychology(all)","type":"journal-article","sha":"124fbaf6ebb5ca85aae5f58500fa398f57a158fb"}
+{"doi":"10.1590/s0102-47442007000300018","title":"Revolucionário e ainda assim desconhecido!","authors":["Peter M. Schuster"],"year":"null","journal":"Revista Brasileira de Ensino de Física","publisher":"FapUNIFESP (SciELO)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"2a732b5790204fab9e57808dd63968b89bca9948"}
+{"doi":"10.1590/s0103-18132008000100002","title":"Constraints to peer scaffolding","authors":["Marília Mendes Ferreira"],"year":"null","journal":"Trabalhos em Linguística Aplicada","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"e921b9d8cf1f8d3f5e72b2af67e1bad4fc1cef4e"}
+{"doi":"10.1590/s0103-18132008000200005","title":"Ich spreche anders, aber das ist auch deutsch: línguas em conflito em uma escola rural localizada em zona de imigração no sul do Brasil","authors":["Maristela Pereira Fritzen"],"year":"null","journal":"Trabalhos em Linguística Aplicada","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"6e39bd59f7852ea61d2325f56f52631d4233e5b5"}
+{"doi":"10.1590/s0103-50532011000200022","title":"Synthesis and antileishmanial activity of new 1-aryl-1H-pyrazole-4-carboximidamides derivatives","authors":["Maurício S. dos Santos","Adriana O Gomes","Alice M. R Bernardino","Marcos C. de Souza","Misbahul A Khan","Monique A. de Brito","Helena C Castro","Paula A Abreu","Carlos R Rodrigues","Rosa M. M. de Léo","Leonor L Leon","Marilene M Canto-Cavalheiro"],"year":"null","journal":"Journal of the Brazilian Chemical Society","publisher":"FapUNIFESP (SciELO)","subject":"Chemistry(all)","type":"journal-article","sha":"0fd34746fa5b0fe2fbd05e8542723ef2cae01cf9"}
+{"doi":"10.1590/s0103-636x2013000100013","title":"O corpo despido pelas práticas de desenhar: dos usos à disciplinarização do desenho","authors":["Rosilene Beatriz Machado","Cláudia Regina Flores"],"year":"null","journal":"Bolema: Boletim de Educação Matemática","publisher":"FapUNIFESP (SciELO)","subject":"Education","type":"journal-article","sha":"2f4233b6f7fda830910139e7126578d0e148fd39"}
+{"doi":"10.1590/s0103-64402012000500022","title":"Giant complex odontoma of the anterior mandible: report of case with long follow up","authors":["Pedro Henrique Rezende Spini","Túlio Humberto Spini","João Paulo Silva Servato","Paulo Rogério de Faria","Sérgio Vitorino Cardoso","Adriano Mota Loyola"],"year":"null","journal":"Brazilian Dental Journal","publisher":"FapUNIFESP (SciELO)","subject":"Dentistry(all)","type":"journal-article","sha":"2a4d96709aa4648c8a6c919c37c71d7d7693e5c0"}
+{"doi":"10.1590/s0103-65132005000200007","title":"Alocação de modelos de produtos a equipes de trabalhadores baseada em modelos de curvas de aprendizagem","authors":["Michel José Anzanello","Flávio Sanson Fogliatto"],"year":"null","journal":"Produção","publisher":"FapUNIFESP (SciELO)","subject":"Industrial and Manufacturing Engineering","type":"journal-article","sha":"af09e20b898ee6b048d427ffa41a9ae66986ff11"}
+{"doi":"10.1590/s0103-97331999000100014","title":"Thermal distributions in stellar plasmas, nuclear reactions and solar neutrinos","authors":["M. Coraddu","G. Kaniadakis","A. Lavagno","M. Lissia","G. Mezzorani","P. Quarati"],"year":"null","journal":"Brazilian Journal of Physics","publisher":"FapUNIFESP (SciELO)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"fcf81d95ffd2815318ee45f51636a1d1587df3be"}
+{"doi":"10.1590/s0103-97332000000200021","title":"Charge carrier mobility and electroluminescence in a green-emitting alternating block copolymer with a methoxy bi-substituted chromophore","authors":["Ma Dongge","I. A. Hümmelgen","Jing Xiabin","Wang Daike","Hong Zhiyong","Wang Lixiang","Zhao Xiaojiang","Wang Fosong"],"year":"null","journal":"Brazilian Journal of Physics","publisher":"FapUNIFESP (SciELO)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"4bf7894bd7d6e0e55cfe71ba97c718e936da737b"}
+{"doi":"10.1590/s0103-97332009000400029","title":"Note on semiclassical uncertainty relations","authors":["F. Olivares","F. Pennini","G. L Ferri","A. Plastino"],"year":"null","journal":"Brazilian Journal of Physics","publisher":"FapUNIFESP (SciELO)","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"b8e6f5e99bf750c97154b3d9205d38212958f144"}
+{"doi":"10.1590/s0104-07072009000200022","title":"Gerenciando uma experiência investigativa na promoção do \"viver saudável\" em um projeto de inclusão social","authors":["Alacoque Lorenzini Erdmann","Marli Terezinha Stein Backes","Dirce Stein Backes","Magda Santos Koerich","Maria Aparecida Baggio","Jacira Nunes Carvalho","Betina Hörner Schlindwein Meirelles"],"year":"null","journal":"Texto & Contexto - Enfermagem","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"acffcd37ed4aec736bbcc2ec15ed85be33a6dbad"}
+{"doi":"10.1590/s0104-14282007000200015","title":"Avaliação da aplicabilidade de técnicas MIC/FT-IR/DSC para a caracterização de filmes multicamadas","authors":["Luciano M. Nogueira","Rita C.L. Dutra","Milton F. Diniz","Marcia Pires","Mônica Evangelista","Fernanda A. Santana","Leandro Tomasi","Priscila dos Santos","Regina Nonemacher"],"year":"null","journal":"Polímeros","publisher":"FapUNIFESP (SciELO)","subject":"Organic Chemistry","type":"journal-article","sha":"cb4ab458c9af3538893c6ac6e067d69c9a451438"}
+{"doi":"10.1590/s0104-42302009000500008","title":"Entorse de tornozelo","authors":["Fábio Lucas Rodrigues","Gilberto Waisberg"],"year":"null","journal":"Revista da Associação Médica Brasileira","publisher":"Elsevier BV","subject":"Medicine(all)","type":"journal-article","sha":"f03610385d240c8a8c46b9e81ceed17d8bb82277"}
+{"doi":"10.1590/s0104-59701995000200015","title":"Missionaries of sciences: the Rockfeller Foundation and Latin America","authors":["Jaime Benchimol"],"year":"null","journal":"História, Ciências, Saúde-Manguinhos","publisher":"FapUNIFESP (SciELO)","subject":"History and Philosophy of Science","type":"journal-article","sha":"d3f1770cfad052d04ff579874da06f3762ca7e30"}
+{"doi":"10.1590/s0104-59702005000300026","title":"Popper e o espelho de Darwin","authors":["Maurício de Carvalho Ramos"],"year":"null","journal":"História, Ciências, Saúde-Manguinhos","publisher":"FapUNIFESP (SciELO)","subject":"History and Philosophy of Science","type":"journal-article","sha":"12666d9e8524d9b06e5385ad4c3a23a4ddc8671d"}
+{"doi":"10.1590/s0104-597020130003000018","title":"Os cientistas e seus arquivos","authors":["Celso Castro"],"year":"null","journal":"História, Ciências, Saúde-Manguinhos","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"48071801b0f600b27da976eba9b7126459063d79"}
+{"doi":"10.1590/s0104-92242011000100002","title":"Estudo da microestrutura formada no processo de soldagem por atrito em aço C-Mn com pino consumível","authors":["Cleber Rodrigo de Lima Lessa","Marcelo Freitas Caregnato","Pedro Henrique Costa Pereira da Cunha","Mariane Chludzinski","Telmo Roberto Strohaecker","Márcio Levi Kramer de Macedo","Marcelo Torres Piza Paes"],"year":"null","journal":"Soldagem & Inspeção","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"ef2dba85f589f11b67263097e66e7191ca2cdea0"}
+{"doi":"10.1590/s0373-55241983000200005","title":"Contribuição ao conhecimento da fauna de peixes do litoral norte do Estado de São Paulo","authors":["Edmundo Ferraz Nonato","A. Cecília Z Amaral","José Lima Figueiredo"],"year":"null","journal":"Boletim do Instituto Oceanográfico","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"b3a73c77f738d84acfd2efc76d604f17a12fcd68"}
+{"doi":"10.1590/s1413-41522007000100005","title":"Aplicação de modelo de simulação-otimização na gestão de perda de água em sistemas de abastecimento","authors":["Carlos César Gumier","Edevar Luvizotto Junior"],"year":"null","journal":"Engenharia Sanitaria e Ambiental","publisher":"FapUNIFESP (SciELO)","subject":"Waste Management and Disposal","type":"journal-article","sha":"82639edd392aec95427018f92c6f05a55dfdcb75"}
+{"doi":"10.1590/s1413-41522009000300014","title":"O emprego da análise de imagem na determinação da distribuição de tamanho de partículas da areia presente no esgoto sanitário","authors":["Gustavo Silva do Prado","José Roberto Campos"],"year":"null","journal":"Engenharia Sanitaria e Ambiental","publisher":"FapUNIFESP (SciELO)","subject":"Waste Management and Disposal","type":"journal-article","sha":"57791e4ca32ed25b0396eb56a5650536f62fe68e"}
+{"doi":"10.1590/s1413-99362008000100002","title":"Das bibliotecas convencionais às digitais: diferenças e convergências","authors":["Murilo Bastos da Cunha"],"year":"null","journal":"Perspectivas em Ciência da Informação","publisher":"FapUNIFESP (SciELO)","subject":"Communication","type":"journal-article","sha":"4dc949cc33ae646b66ed35bfe93b04127791b5d8"}
+{"doi":"10.1590/s1414-32832000000200011","title":"A avaliação transdisciplinar e poder: levantando algumas questões","authors":["Luiz Carlos de Oliveira Cecílio"],"year":"null","journal":"Interface - Comunicação, Saúde, Educação","publisher":"FapUNIFESP (SciELO)","subject":"Communication","type":"journal-article","sha":"ba766adacf904f5e7e94fa592558cc3c1f45f6fd"}
+{"doi":"10.1590/s1415-47572008005000009","title":"Autonomous growth of BALB/MK keratinocytes transfected with a retroviral vector carrying the human epidermal growth factor gene","authors":["Jomuna V. Choudhuri","Monica B. Mathor","Flávia H. Silva","Sang W. Han"],"year":"null","journal":"Genetics and Molecular Biology","publisher":"FapUNIFESP (SciELO)","subject":"Genetics","type":"journal-article","sha":"f4aaabcc5cb42d039230729f9df21cd975c277a9"}
+{"doi":"10.1590/s1415-54192007000100013","title":"Relação comercial do ortodontista brasileiro com o seu paciente, natureza obrigacional dos serviços prestados e riscos do tratamento ortodôntico","authors":["Elionai Dias Soares","Adriana Silva de Carvalho","Jurandir Antônio Barbosa"],"year":"null","journal":"Revista Dental Press de Ortodontia e Ortopedia Facial","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"9e68fd7f7a666d02c40b13a2546c16f62cec18b6"}
+{"doi":"10.1590/s1415-65552004000100015","title":"Organizações do conhecimento: infra-estrutura, pessoas e tecnologias","authors":["Aluízia Aparecida Cadori"],"year":"null","journal":"Revista de Administração Contemporânea","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"00cedef498463d76de3d5909795f5a8a57a7f70b"}
+{"doi":"10.1590/s1415-790x2005000100010","title":"Vigilância Sanitária: uma proposta de análise dos contextos locais","authors":["Márcia Franke Piovesan","Maria Valéria Vasconcelos Padrão","Maria Umbelina Dumont","Gracia Maria Gondim","Oviromar Flores","José Ivo Pedrosa","Luiz Felipe Moreira Lima"],"year":"null","journal":"Revista Brasileira de Epidemiologia","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"41cf1a86e323d448523bc0e89f87afe9cc77769a"}
+{"doi":"10.1590/s1516-89132008000300008","title":"Population structure and somatic indexes of Hypostomus cf. ancistroides (Siluriformes, Loricariidae) collected from the Bonito river, Ivaí river basin, Turvo, Paraná","authors":["Douglas Viana","Luciano Lazzarini Wollf","Tânia Zaleski","Silvia Romão","Gustavo Bertoldi","Lucélia Donatti"],"year":"null","journal":"Brazilian Archives of Biology and Technology","publisher":"FapUNIFESP (SciELO)","subject":"General","type":"journal-article","sha":"34c77a534fe795f55b949a6f4ce4a8e76ce9ac61"}
+{"doi":"10.1590/s1806-11172005000200015","title":"Bose e Einstein: do nascimento da estatística quântica à condensação sem interação II","authors":["Sílvio R. Dahmen"],"year":"null","journal":"Revista Brasileira de Ensino de Física","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"8f36171348932f1cda47f46004fb981954e9a164"}
+{"doi":"10.1590/s1984-46702009005000003","title":"Estudo dos vestígios de peixes dos sítios arqueológicos da área de influência da Usina Hidrelétrica Machadinho, Rio Grande do Sul, Brasil","authors":["Cláudio Ricken","Luiz Roberto Malabarba"],"year":"null","journal":"Zoologia (Curitiba)","publisher":"FapUNIFESP (SciELO)","subject":"","type":"journal-article","sha":"4e69fdb25d5beaa5e0f7276c26ee1188ce7fecc4"}
+{"doi":"10.1602/neurorx.2.3.471","title":"Animal models of Kennedy disease","authors":["Diane E. Merry"],"year":"2005","journal":"NeuroRX","publisher":"Springer Nature","subject":"Pharmacology (medical)","type":"journal-article","sha":"bd2c7a55293ed95cf347c2af9866d9b578264189"}
+{"doi":"10.1620/tjem.210.291","title":"Chloride-Dependent Intracellular pH Regulation via Extracellular Calcium-Sensing Receptor in the Medullary Thick Ascending Limb of the Mouse Kidney","authors":["Ulviyya Fizuli Aslanova","Tetsuji Morimoto","Elnur Ilham Farajov","Naonori Kumagai","Minako Nishino","Noriko Sugawara","Atsushi Ohsaga","Yoshio Maruyama","Shigeru Tsuchiya","Shori Takahashi","Yoshiaki Kondo"],"year":"2006","journal":"The Tohoku Journal of Experimental Medicine","publisher":"Tohoku University Medical Press","subject":"Medicine(all)","type":"journal-article","sha":"9b19af750904e12167d89965ea3925bf959af3b9"}
+{"doi":"10.17660/actahortic.2004.651.9","title":"OPTIMISATION OF RELATIVE QUANTITATIVE RT-PCR FOR EXPRESSION ANALYSIS IN AZALEA FLOWER COLOUR SPORTS","authors":["E. De Keyser","J. De Riek","E. Van Bockstaele"],"year":"2004","journal":"Acta Horticulturae","publisher":"International Society for Horticultural Science (ISHS)","subject":"Horticulture","type":"journal-article","sha":"af489ab8ee306a093e928ff68ac38018a598ff74"}
+{"doi":"10.18356/ccb877f3-en","title":"Abbreviations and acronyms","authors":[" "],"year":"2015","journal":"United Nations disarmament yearbook 2013: Part II","publisher":"United Nations Publications","subject":"","type":"book-chapter","sha":"655926e4479140734e9e03b4a57b73cb4e2ad852"}
+{"doi":"10.18637/jss.v016.i01","title":" Formulating State Space Models in R with Focus on Longitudinal Regression Models ","authors":["Claus Dethlefsen","Søren Lundbye-Christensen"],"year":"null","journal":"Journal of Statistical Software","publisher":"Foundation for Open Access Statistic","subject":"","type":"journal-article","sha":"38ae4a33cef875631f5260b4d59523972815d73b"}
+{"doi":"10.1890/03-4101","title":"ECOSYSTEM EFFECTS OF BIODIVERSITY MANIPULATIONS IN EUROPEAN GRASSLANDS","authors":["E. M. Spehn","A. Hector","J. Joshi","M. Scherer-Lorenzen","B. Schmid","E. Bazeley-White","C. Beierkuhnlein","M. C. Caldeira","M. Diemer","P. G. Dimitrakopoulos","J. A. Finn","H. Freitas","P. S. Giller","J. Good","R. Harris","P. Högberg","K. Huss-Danell","A. Jumpponen","J. Koricheva","P. W. Leadley","M. Loreau","A. Minns","C. P. H. Mulder","G. O'Donovan","S. J. Otway","C. Palmborg","J. S. Pereira","A. B. Pfisterer","A. Prinz","D. J. Read","E.-D. Schulze","A.-S. D. Siamantziouras","A. C. Terry","A. Y. Troumbis","F. I. Woodward","S. Yachi","J. H. Lawton"],"year":"2005","journal":"Ecological Monographs","publisher":"Wiley-Blackwell","subject":"Ecology, Evolution, Behavior and Systematics","type":"journal-article","sha":"f0edf27a49ed5806555d4c9b7284f4d69bdaa5c0"}
+{"doi":"10.2105/ajph.2007.114249","title":"Characteristics of Recipients of Free Prescription Drug Samples: A Nationally Representative Analysis","authors":["Sarah L. Cutrona","Steffie Woolhandler","Karen E. Lasser","David H. Bor","Danny McCormick","David U. Himmelstein"],"year":"2008","journal":"American Journal of Public Health","publisher":"American Public Health Association","subject":"Public Health, Environmental and Occupational Health","type":"journal-article","sha":"5b983fda31c588e62f9ae02546f882b047dc850d"}
+{"doi":"10.2116/analsci.29.143","title":"Characterization of Japanese Polished Rice by Stable Hydrogen Isotope Analysis of Total Fatty Acids for Tracing Regional Origin","authors":["Yaeko SUZUKI","Fumikazu AKAMATSU","Rumiko NAKASHITA","Takashi KORENAGA"],"year":"2013","journal":"Analytical Sciences","publisher":"Japan Society for Analytical Chemistry","subject":"Analytical Chemistry","type":"journal-article","sha":"7691e0a176891d6079176af20ecdf1c32ee97d81"}
+{"doi":"10.2139/ssrn.1112030","title":"Central Bank Independence and Transparency: Evolution and Effectiveness","authors":["Ellen E. Meade","Christopher Crowe"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"eb7962e0daf2935435dc19298f8dd0225dada6f1"}
+{"doi":"10.2139/ssrn.1338758","title":"Private Incentives to Innovate: Interplay of New Products and Brand-Name Reputation","authors":["Nina Leheyda"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"98b6b1c1fb262edb1dd3e3679593acb266b60f3a"}
+{"doi":"10.2139/ssrn.1406950","title":"Multi-Factor Gegenbauer Processes and European Inflation Rates","authors":["Guglielmo Maria Caporale","Luis A. Gil-Alana"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"167c256dba61a9800ca49786c532438d12e2e2fa"}
+{"doi":"10.2139/ssrn.1555660","title":"Fiduciary Duties and Equityâ€Debtholder Conflicts","authors":["Bo Becker","Per Johan Strömberg"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"7811cd858762ef9227dd81514191d85af970f1bf"}
+{"doi":"10.2139/ssrn.157835","title":"Trading Volume and Cross-Autocorrelations in Stock Returns","authors":["Tarun Chordia","Bhaskaran Swaminathan"],"year":"null","journal":"SSRN Electronic Journal ","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"1226761434526332c98927c364c78d3d452b0624"}
+{"doi":"10.2139/ssrn.1603822","title":"(Anti-) Coordination in Networks","authors":["Jaromir Kovarik","Friederike Mengel","J. Gabriel Romero"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"0fbb9c74cdb89dae0a88033d54d96b7fc31fe123"}
+{"doi":"10.2139/ssrn.1628104","title":"From Russia with Love: The Impact of Relocated Firms on Incumbent Survival","authors":["Oliver Falck","Christina Guenther","Stephan Heblich","William R. Kerr"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"2dfa559a1a261af0d55069ffe17a6835ac78abe7"}
+{"doi":"10.2139/ssrn.1943506","title":"Remittances and Financial Openness","authors":["Michel A. R. Beine","Elisabetta Lodigiani","Robert Vermeulen"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"11c2758cb463ffc56c21640f97c93428f00f2155"}
+{"doi":"10.2139/ssrn.1978767","title":"Strong Anonymity and Infinite Streams","authors":["Luc Lauwers"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"cd3752fc84f00134c7a613a2a1312fa504842753"}
+{"doi":"10.2139/ssrn.2254524","title":"An Overview of Agricultural Credit and Crop Insurance in Bihar","authors":["R. K. P. Singh","Krishna M. Singh"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"e1fd4b9c3d3502bd54fd9f4ad7d9f99c59faa2bc"}
+{"doi":"10.2139/ssrn.423360","title":"Does Education Really Disadvantage Women in the Marriage Market?","authors":["Elaina Rose"],"year":"null","journal":"SSRN Electronic Journal ","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"bde6fa6f143443c23f6be8c36c80adf5f78ad423"}
+{"doi":"10.2139/ssrn.423403","title":"Family Tax Splitting: A Microsimulation of its Potential Labour\\r\\nSupply and Intra-household Welfare Effects in Germany","authors":["Miriam Beblo","Denis Beninger","Francois Lasiney"],"year":"null","journal":"SSRN Electronic Journal ","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"22365b76518ff793e7d34b5e58a36b1770551e32"}
+{"doi":"10.2139/ssrn.951732","title":"Generalized Monotonicity Analysis","authors":["Bruno H. Strulovici","Thomas A. Weber"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"1bb70d82b0076844d84751e48949a2c1f3ff705a"}
+{"doi":"10.2139/ssrn.967633","title":"Non-Linear Growth Effects of Financial Development: Does Financial Integration Matter?","authors":["Igor Masten","Fabrizio Coricelli","Arjana Brezigar Masten"],"year":"null","journal":"SSRN Electronic Journal","publisher":"Elsevier BV","subject":"","type":"journal-article","sha":"4a09c2015bd42d1e04825294652045f6479e61a2"}
+{"doi":"10.2147/dddt.s87197","title":"Distinct prognostic values and potential drug targets of ALDH1 isoenzymes in non-small-cell lung cancer","authors":["Qinghua You","Dongxiang Xu","Huanchen Guo"],"year":"null","journal":"Drug Design, Development and Therapy","publisher":"Dove Medical Press Ltd.","subject":"Pharmacology","type":"journal-article","sha":"1b32d3c4dae5e34bbab7ecf9eaba8e60d0c9ff24"}
+{"doi":"10.2147/dmso.s20649","title":"Impact and cost of a 2-week community-based screening and awareness program for diabetes and cardiovascular risk factors in a Swiss canton","authors":["Pascal Bovet"," Hirsiger"," Emery"," De Bernardini"," Rossier"," Trebeljahr"," Hagon"],"year":"null","journal":"Diabetes, Metabolic Syndrome and Obesity: Targets and Therapy","publisher":"Dove Medical Press Ltd.","subject":"Internal Medicine","type":"journal-article","sha":"8aeef1e9453786a1b5d76b0c68e3ad4f2978ad16"}
+{"doi":"10.2147/dmso.s48260","title":"Akt/PKB activation and insulin signaling: a novel insulin signaling pathway in the treatment of type 2 diabetes","authors":["Richard Mackenzie","Bradley Elliott"],"year":"null","journal":"Diabetes, Metabolic Syndrome and Obesity: Targets and Therapy","publisher":"Dove Medical Press Ltd.","subject":"Internal Medicine","type":"journal-article","sha":"00be67b404fb88a1460bd4a5a2123055214a699d"}
+{"doi":"10.2147/ijn.s25251","title":"Novel docetaxel-loaded nanoparticles based on PCL-Tween 80 copolymer for cancer treatment","authors":["Lin Mei"," Zeng"," Cheng"," Zheng"," Song"," Huang"," Ma"],"year":"null","journal":"International Journal of Nanomedicine","publisher":"Dove Medical Press Ltd.","subject":"","type":"journal-article","sha":"52a33583d87c2dc19f47ed822012dad6ccc7c62e"}
+{"doi":"10.2147/ndt.s42628","title":"Balance and gait in older electroconvulsive therapy recipients: a pilot study","authors":["Chris Plakiotis"," Barson"," Vengadasalam"," Haines"," O'Connor"],"year":"null","journal":"Neuropsychiatric Disease and Treatment","publisher":"Dove Medical Press Ltd.","subject":"Biological Psychiatry","type":"journal-article","sha":"9f46700eea89690b68bf948b4eeb800c494ec39e"}
+{"doi":"10.2147/ott.s37407","title":"Dual control of Shuanghuang Shengbai granule on upstream and downstream signal modulators of CyclinD-CDK4/6 signaling pathway of cell cycle in Lewis-bearing mice with cyclophosphamide-induced myelosuppression","authors":["Zhen-ye Xu"," Xian-Gu","Ling Yu-Zhu","Li-Fang Wang"," Kai-Li"," Qiang-Pei"],"year":"null","journal":"OncoTargets and Therapy","publisher":"Dove Medical Press Ltd.","subject":"Pharmacology (medical)","type":"journal-article","sha":"2b17f9d7f6e34095fc73d186d743439bab5b7121"}
+{"doi":"10.2147/prbm.s38707","title":"Conceptual measurement framework for help-seeking for mental health problems","authors":["Debra Rickwood"," Thomas"],"year":"null","journal":"Psychology Research and Behavior Management","publisher":"Dove Medical Press Ltd.","subject":"Psychology(all)","type":"journal-article","sha":"dd53bba734be518f478576598a8c25068b5152c1"}
+{"doi":"10.2166/wp.2007.060","title":"PPP – policies, practices and problems in Ghana's urban water supply","authors":["Veronika Fuest","Stefan A. Haffner"],"year":"2007","journal":"Water Policy","publisher":"IWA Publishing","subject":"Geography, Planning and Development","type":"journal-article","sha":"eea6d1a429b2953ebe528e1bae93a9580e081bcf"}
+{"doi":"10.2172/10176244","title":"Hanford Radiological Protection Support Services annual report for 1993","authors":["M. Lyon","D.E. Bihl","J.J. Fix","T.J. Froelich","R.K. Piper","P.C. Olsen"],"year":"null","journal":"","publisher":"Office of Scientific and Technical Information (OSTI)","subject":"","type":"report","sha":"bbe632e7c1487dabdad631e9084ccb204f2e47f8"}
+{"doi":"10.2172/205215","title":"Links in a distributed database: Theory and implementation","authors":["N.T. Karonis","M.R. Kraimer"],"year":"null","journal":"","publisher":"Office of Scientific and Technical Information (OSTI)","subject":"","type":"report","sha":"8affbc8f2aa4504802548c967aa4014f641645e6"}
+{"doi":"10.2172/61112","title":"Operability test procedure for rotary mode core sampling system {number_sign}3","authors":["T.R. Farris","T.D. Jarecki"],"year":"null","journal":"","publisher":"Office of Scientific and Technical Information (OSTI)","subject":"","type":"report","sha":"e304d01cc502dbcb7b082b24ca0b846a80b4e6bc"}
+{"doi":"10.2174/0929867003374570","title":"Molecular Manipulation of G Protein coupled Receptors A New Avenue into Drug Discovery","authors":["M. Sautel","G. Milligan"],"year":"2000","journal":"Current Medicinal Chemistry","publisher":"Bentham Science Publishers Ltd.","subject":"Molecular Medicine","type":"journal-article","sha":"4ed658a94515ab988e31999eed9a0b75f0488acf"}
+{"doi":"10.2174/157340712799828205","title":"Monoterpenes of Salvia leucophylla","authors":["Atsushi Sakai"],"year":"2012","journal":"Current Bioactive Compounds","publisher":"Bentham Science Publishers Ltd.","subject":"Pharmacology, Toxicology and Pharmaceutics(all)","type":"journal-article","sha":"4f1b82447384c783c7472679b1c992968eac215f"}
+{"doi":"10.2215/cjn.06080809","title":"Understanding Sources of Dietary Phosphorus in the Treatment of Patients with Chronic Kidney Disease","authors":["K. Kalantar-Zadeh","L. Gutekunst","R. Mehrotra","C. P. Kovesdy","R. Bross","C. S. Shinaberger","N. Noori","R. Hirschberg","D. Benner","A. R. Nissenson","J. D. Kopple"],"year":"2010","journal":"Clinical Journal of the American Society of Nephrology","publisher":"American Society of Nephrology (ASN)","subject":"Medicine(all)","type":"journal-article","sha":"80626aacf94314dafccc8caab9b3659a035a521f"}
+{"doi":"10.2298/pan0701005a","title":"Does political instability lead to higher and more volatile inflation?: A panel data analysis","authors":["Ari Aisen","Francisco Veiga"],"year":"2007","journal":"Panoeconomicus","publisher":"National Library of Serbia","subject":"Economics, Econometrics and Finance(all)","type":"journal-article","sha":"adf0463c03c4ed0202f8d2f2aedeaaa3f966faab"}
+{"doi":"10.2320/matertrans.48.860","title":"High-Resolution Electron Microscopic Study on Atomic Arrangements at Growing Tips of Martensite Plates and a Nucleating Martensite in Fe-Ni-Mn and Fe-Cr-C Alloys","authors":["K. Ogawa","S. Kajiwara"],"year":"2007","journal":"MATERIALS TRANSACTIONS","publisher":"Japan Institute of Metals","subject":"Mechanical Engineering","type":"journal-article","sha":"b76dddd179920b0a64e4b6a2f4c0421f393546ec"}
+{"doi":"10.2320/matertrans.mra2008046","title":"Size-Dependence of Martensite Transformation Temperature of Yttria-Doped Zirconia and the Distribution of Nucleation Sites","authors":["Motozo Hayakawa","Masanori Tamaki"],"year":"2008","journal":"MATERIALS TRANSACTIONS","publisher":"Japan Institute of Metals","subject":"Mechanical Engineering","type":"journal-article","sha":"04ce43d20e39caffd9e73e844b413e202124f03d"}
+{"doi":"10.2337/diab.33.3.301","title":"Characterization of the Serum from a Patient with Insulin Resistance and Hypoglycemia: Evidence for Multiple Populations of Insulin Receptor Antibodies with Different Receptor Binding and Insulin-mimicking Activities","authors":["R. De Pirro","R. A. Roth","L. Rossetti","I. D. Goldfine"],"year":"1984","journal":"Diabetes","publisher":"American Diabetes Association","subject":"Internal Medicine","type":"journal-article","sha":"473b48562854741b770a8402e45dc41ba3b03b54"}
+{"doi":"10.2337/diacare.26.3.960","title":"Evidence for Associated Cutaneous Microangiopathy in Diabetic Patients With Neuropathic Foot Ulceration","authors":["N. Chabbert-Buffet","C. LeDevehat","T. Khodabandhelou","E. Allaire","J. P. Gaitz","L. Tribout","N. Abdoucheli-Baudot","M. Vayssairat"],"year":"2003","journal":"Diabetes Care","publisher":"American Diabetes Association","subject":"Internal Medicine","type":"journal-article","sha":"a83d363dea48ba9242d40ab1fe9ad0a82a55d41c"}
+{"doi":"10.2337/diacare.27.7.1851","title":"Has RoboCop Got Diabetes?","authors":["B. Berger","P. Burian","K. Nilsson","M. Karlen","E. Rylander"],"year":"2004","journal":"Diabetes Care","publisher":"American Diabetes Association","subject":"Internal Medicine","type":"journal-article","sha":"a9fa613b32d5fa892f19d529e31a649bba9153cf"}
+{"doi":"10.2478/v10103-012-0032-8","title":"Spatial Analysis of the Labour Market by Using Econometric Tools. The Case of Lower Silesia Region (Dolnośląskie Voivodship)","authors":["Elżbieta Litwińska"],"year":"2012","journal":"Comparative Economic Research","publisher":"Walter de Gruyter GmbH","subject":"","type":"journal-article","sha":"8e0bdf56e396d86a2b4e2615f92ce4d49219ce19"}
+{"doi":"10.2495/dman090211","title":"DMT – an integrated disaster management tool","authors":["M. Angermann","M. Khider","M. Frassl","M. Lichtenstern"],"year":"2009","journal":"Disaster Management and Human Health Risk","publisher":"WITPRESS LTD.","subject":"","type":"proceedings-article","sha":"b52c9e9a0fd0e044e8c8db4f9875e840ed619d60"}
+{"doi":"10.2498/cit.1001770","title":"Arabic Text Classification Framework Based on Latent Dirichlet Allocation","authors":["Mounir Zrigui","Rami Ayadi","Mourad Mars","Mohsen Maraoui"],"year":"2012","journal":"Journal of Computing and Information Technology","publisher":"Faculty of Electrical Engineering and Computing, Univ. of Zagreb","subject":"","type":"journal-article","sha":"fefb8d754de579a31c780343d9286cb16f317c51"}
+{"doi":"10.2514/6.2008-7355","title":"Stability and Control of Relative Equilibria of Three-Spacecraft Magnetically Tethered Systems","authors":["Islam Hussein","Anthony Bloch"],"year":"2008","journal":"AIAA/AAS Astrodynamics Specialist Conference and Exhibit","publisher":"American Institute of Aeronautics and Astronautics (AIAA)","subject":"","type":"proceedings-article","sha":"37ae761b978b1eda3dc513aec9fd6445d2ba1eb9"}
+{"doi":"10.2514/6.2009-955","title":"Field-Responsive Colloidal Suspensions in Microgravity","authors":["Eric Furst","Paula Vasquez","Eric Bennung","Michael Boyle","Malvika Ogale","Juan Agui","Donna Bohman","Charles Bunnell","Peggy Whitson"],"year":"2009","journal":"47th AIAA Aerospace Sciences Meeting including The New Horizons Forum and Aerospace Exposition","publisher":"American Institute of Aeronautics and Astronautics (AIAA)","subject":"","type":"proceedings-article","sha":"db704f16e53c143f0bfc6042ae017df399899cce"}
+{"doi":"10.2528/pierb08020103","title":"SHAPED BEAM PATTERN SYNTHESIS WITH NON-UNIFORM SAMPLE PHASES","authors":["Joaquim Amândio Rodrigues Azevedo"],"year":"2008","journal":"Progress In Electromagnetics Research B","publisher":"EMW Publishing","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"71fe419830e4e04db40c025b93becf20334324a6"}
+{"doi":"10.3115/v1/w14-4334","title":"A Demonstration of Dialogue Processing in SimSensei Kiosk","authors":["Fabrizio Morbini","David DeVault","Kallirroi Georgila","Ron Artstein","David Traum","Louis-Philippe Morency"],"year":"2014","journal":"Proceedings of the 15th Annual Meeting of the Special Interest Group on Discourse and Dialogue (SIGDIAL)","publisher":"Association for Computational Linguistics (ACL)","subject":"","type":"proceedings-article","sha":"ddd3f37f54d6fd70ef36f436628214d984362add"}
+{"doi":"10.3130/jaabe.2.b145","title":"Precedent and Progress of an Idea: Quadruple Building Block and the Schindler Shelter","authors":["Jin-Ho Park"],"year":"2003","journal":"Journal of Asian Architecture and Building Engineering","publisher":"Architectural Institute of Japan","subject":"Civil and Structural Engineering","type":"journal-article","sha":"b695c68a67b6767dfe04c7d004726860f7134675"}
+{"doi":"10.3324/haematol.13035","title":"Blocking the APRIL circuit enhances acute myeloid leukemia cell chemosensitivity","authors":["D. Bonci","M. Musumeci","V. Coppola","A. Addario","C. Conticello","M. Hahne","M. Gulisano","F. Grignani","R. De Maria"],"year":"2008","journal":"Haematologica","publisher":"Ferrata Storti Foundation (Haematologica)","subject":"Hematology","type":"journal-article","sha":"0d9a6c6c9501934e791339e54eef1c1ddc856ccd"}
+{"doi":"10.3324/haematol.2010.028977","title":"Integration of molecular and clinical data of 40 unrelated von Willebrand Disease families in a Spanish locus-specific mutation database: first release including 58 mutations","authors":["I. Corrales","L. Ramirez","J. Ayats","C. Altisent","R. Parra","F. Vidal"],"year":"2010","journal":"Haematologica","publisher":"Ferrata Storti Foundation (Haematologica)","subject":"Hematology","type":"journal-article","sha":"742f25de65b4c04cc83023cc0aa1f0d40a489367"}
+{"doi":"10.3367/ufne.0182.201207d.0748","title":"Physics news on the Internet (based on electronic preprints)","authors":["Yurii N Eroshenko"],"year":"2012","journal":"Physics-Uspekhi","publisher":"Uspekhi Fizicheskikh Nauk (UFN) Journal","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"e6f0b10cdfa9602d658f32f1d7542206508b8cc3"}
+{"doi":"10.3367/ufnr.0184.201403b.0231","title":"Ya.B. Zeldovich and equation of state problems for matter under extreme conditions","authors":["Vladimir E. Fortov","Igor V. Lomonosov"],"year":"2014","journal":"Uspekhi Fizicheskih Nauk","publisher":"Uspekhi Fizicheskikh Nauk (UFN) Journal","subject":"Physics and Astronomy(all)","type":"journal-article","sha":"cb369159788956668cb3240703217539eacaacb9"}
+{"doi":"10.3386/w11866","title":"A Dual Policy Paradox: Why Have Trade and Immigration Policies Always Differed in Labor-Scarce Economies","authors":["Timothy Hatton","Jeffrey Williamson"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"9b41dc3e2df7e805ced3595dceb90a283d6adbfc"}
+{"doi":"10.3386/w14695","title":"The Economics and Psychology of Inequality and Human Development","authors":["Flavio Cunha","James Heckman"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"71529e69d57922613b769342cf3b286b79674e10"}
+{"doi":"10.3386/w17553","title":"A Unified Theory of Firm Selection and Growth","authors":["Costas Arkolakis"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"1cfe57825a9e5700a2c20913219cc873ed318979"}
+{"doi":"10.3386/w2076","title":"International Debt Service and Economic Growth: Some Simple Analytics","authors":["Feldstein Martin"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"230ea2621a71b7fb9833a39e9a0ddd3c9f135b99"}
+{"doi":"10.3386/w5902","title":"The Influence of Income Tax Rules on Insurance Reserves","authors":["David Bradford","Kyle Logue"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"33423950cccbc8f876ffcd72162e64b422666032"}
+{"doi":"10.3386/w9839","title":"Regime-Switching and the Estimation of Multifractal Processes","authors":["Laurent Calvet","Adlai Fisher"],"year":"null","journal":"","publisher":"National Bureau of Economic Research","subject":"","type":"report","sha":"c150bd00f0ef648f937ced7fb214f2b7e347006a"}
+{"doi":"10.3390/antiox3020339","title":"Extraction of Antioxidants from Borage (Borago officinalis L.) Leaves—Optimization by Response Surface Method and Application in Oil-in-Water Emulsions","authors":["Francisco Segovia","Bryshila Lupo","Sara Peiró","Michael Gordon","María Almajano"],"year":"null","journal":"Antioxidants","publisher":"MDPI AG","subject":"","type":"journal-article","sha":"a03179f585ea92d01e04443c73edce7300d65456"}
+{"doi":"10.3390/ijms15023319","title":"The Glutathione Peroxidase Gene Family in Thellungiella salsuginea: Genome-Wide Identification, Classification, and Gene and Protein Expression Analysis under Stress Conditions","authors":["Fei Gao","Jing Chen","Tingting Ma","Huayun Li","Ning Wang","Zhanglei Li","Zichen Zhang","Yijun Zhou"],"year":"null","journal":"International Journal of Molecular Sciences","publisher":"MDPI AG","subject":"Physical and Theoretical Chemistry","type":"journal-article","sha":"29e8deb17f69a23fc87ed679c243de41c9822a18"}
+{"doi":"10.3390/s7071091","title":"Comparative Study of Protein Immobilization Properties on Calixarene Monolayers","authors":["Hongxia Chen","Minsu Lee","Sungwook Choi","Jae-Ho Kim","Heung-Jin Choi","Sung-Hoon Kim","Jeabeom Lee","Kwangnak Koh"],"year":"2007","journal":"Sensors","publisher":"MDPI AG","subject":"Electrical and Electronic Engineering","type":"journal-article","sha":"0a7c995028ae1b4188ad3ec0a2669b8712d5cf0c"}
+{"doi":"10.3390/vaccines2040735","title":"Self-Amplifying Replicon RNA Vaccine Delivery to Dendritic Cells by Synthetic Nanoparticles","authors":["Kenneth McCullough","Panagiota Milona","Lisa Thomann-Harwood","Thomas Démoulins","Pavlos Englezou","Rolf Suter","Nicolas Ruggli"],"year":"null","journal":"Vaccines","publisher":"MDPI AG","subject":"","type":"journal-article","sha":"63ffbbc9d28a38e19b28668611c8e75ed4b644d2"}
+{"doi":"10.3758/bf03193116","title":"Danger and usefulness: An alternative framework for understanding rapid evaluation effects in perception?","authors":["Lee H. Wurm"],"year":"2007","journal":"Psychonomic Bulletin & Review","publisher":"Springer Nature","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"282637f682d8bc98e0193b5b6382806495b4fe87"}
+{"doi":"10.3758/bf03210711","title":"Visual acceleration detection: Effect of sign and motion orientation","authors":["Jack B. Calderone","Mary K. Kaiser"],"year":"1989","journal":"Perception & Psychophysics","publisher":"Springer Nature","subject":"Experimental and Cognitive Psychology","type":"journal-article","sha":"12da41b40ff249d25fbb137f3b2f3de51eab595c"}
+{"doi":"10.3758/bf03328259","title":"Shape discrimination and transfer in the California sea lion","authors":["Ronald J. Schusterman","Thomas Thomas"],"year":"1966","journal":"Psychonomic Science","publisher":"Springer Nature","subject":"Arts and Humanities(all)","type":"journal-article","sha":"1023569cb1f068df6f70b547b2528bbfaf187649"}
+{"doi":"10.3758/bf03330896","title":"Authoritarianism and exposure to another’s behavior in a risk-taking situation","authors":["Robert A. Baron","Sidney J. Arenson"],"year":"1967","journal":"Psychonomic Science","publisher":"Springer Nature","subject":"Arts and Humanities(all)","type":"journal-article","sha":"7652b2c615ecd0da024c65b31fbb4c662308bd59"}
+{"doi":"10.3758/s13428-012-0304-z","title":"The semantic priming project","authors":["Keith A. Hutchison","David A. Balota","James H. Neely","Michael J. Cortese","Emily R. Cohen-Shikora","Chi-Shing Tse","Melvin J. Yap","Jesse J. Bengson","Dale Niemeyer","Erin Buchanan"],"year":"2013","journal":"Behavior Research Methods","publisher":"Springer Nature","subject":"Psychology (miscellaneous)","type":"journal-article","sha":"c44e3aa9b8e318e61c47e706f5545334ef399545"}
+{"doi":"10.3762/bjoc.6.25","title":"Efficient and improved synthesis of Telmisartan","authors":["A Sanjeev Kumar","Samir Ghosh","G N Mehta"],"year":"null","journal":"Beilstein Journal of Organic Chemistry","publisher":"Beilstein Institut","subject":"Organic Chemistry","type":"journal-article","sha":"c61bc681c7932518eb0f66e159a5af6f995b1fd2"}
+{"doi":"10.3807/josk.2007.11.3.108","title":"3D Nano Object Recognition based on Phase Measurement Technique","authors":["Dae-Suk Kim","Byung-Joon Baek","Young-Dong Kim","Bahram Javidi"],"year":"2007","journal":"Journal of the Optical Society of Korea","publisher":"Korean Journal of Optics and Photonics","subject":"Atomic and Molecular Physics, and Optics","type":"journal-article","sha":"721a1dbddb2f0af175cd2e6e8f47649b63ff88a7"}
+{"doi":"10.3852/mycologia.98.6.917","title":"A phylogenetic overview of the Agaricomycotina","authors":["D. S. Hibbett"],"year":"2006","journal":"Mycologia","publisher":"Informa UK Limited","subject":"Plant Science","type":"journal-article","sha":"c7d5f924eb6fd54fd79d7e97eee0d4fa5a3450bb"}
+{"doi":"10.3897/zookeys.440.7891","title":"The deep phylogeny of jumping spiders (Araneae, Salticidae)","authors":["Wayne Maddison","Daiqin Li","Melissa Bodner","Junxia Zhang","Xu Xin","Qinqing Liu","Fengxiang Liu"],"year":"null","journal":"ZooKeys","publisher":"Pensoft Publishers","subject":"Animal Science and Zoology","type":"journal-article","sha":"46a58d6350c445f22d30637f3e8cb60e1fafebc3"}
+{"doi":"10.3899/jrheum.120964","title":"Clinical, Functional, and Radiographic Benefits of Longterm Adalimumab Plus Methotrexate: Final 10-year Data in Longstanding Rheumatoid Arthritis","authors":["E. C. Keystone","D. van der Heijde","A. Kavanaugh","H. Kupper","S. Liu","B. Guerette","N. Mozaffarian"],"year":"2013","journal":"The Journal of Rheumatology","publisher":"The Journal of Rheumatology","subject":"Immunology","type":"journal-article","sha":"262e2dd6d02ed65774db817a11543e1162f3ebe1"}
+{"doi":"10.3928/01477447-20130426-04","title":"Operative Decisions for Endoscopic Treatment of Cubital Tunnel Syndrome","authors":["Hans-Georg Damert","Silke Altmann","Manfred Infanger","Armin Kraus"],"year":"2013","journal":"Orthopedics","publisher":"SLACK, Inc.","subject":"Orthopedics and Sports Medicine","type":"journal-article","sha":"971b7185c1de1b6cc7eba9c7ca3f7ff9ea2e8245"}
+{"doi":"10.3987/com-13-s(s)5","title":"Palladium-Catalyzed Tetraarylation of 5,15-Dialkylporphyrins with Aryl Bromides","authors":["Atsuhiro Osuka","Hideki Yorimitsu","Yutaro Yamamoto","Sumito Tokuji","Takayuki Tanaka"],"year":"2014","journal":"HETEROCYCLES","publisher":"Japan Institute of Heterocyclic Chemistry","subject":"Organic Chemistry","type":"journal-article","sha":"3ba3f231382152a1365e861f1ba7efba44b6febf"}
+{"doi":"10.4049/jimmunol.1003952","title":"Silencing MicroRNA-155 Ameliorates Experimental Autoimmune Encephalomyelitis","authors":["G. Murugaiyan","V. Beynon","A. Mittal","N. Joller","H. L. Weiner"],"year":"2011","journal":"The Journal of Immunology","publisher":"The American Association of Immunologists","subject":"Medicine(all)","type":"journal-article","sha":"dd390b175a80970781447a5907aea75a4ec1d47b"}
+{"doi":"10.4049/jimmunol.165.11.6116","title":"Absence of Functional Inducible NO Synthase Enhances the Efficacy of Tolerance Induced by High Dose Antigen Feeding","authors":["D. A. Kahn","D. C. Archer","C. J. Kelly"],"year":"2000","journal":"The Journal of Immunology","publisher":"The American Association of Immunologists","subject":"Medicine(all)","type":"journal-article","sha":"b69cab3c484a5dbfb145d13f650b0a9e33f635c9"}
+{"doi":"10.4049/jimmunol.179.7.4580","title":"Neonatal FcR Expression in Bone Marrow-Derived Cells Functions to Protect Serum IgG from Catabolism","authors":["S. Akilesh","G. J. Christianson","D. C. Roopenian","A. S. Shaw"],"year":"2007","journal":"The Journal of Immunology","publisher":"The American Association of Immunologists","subject":"Medicine(all)","type":"journal-article","sha":"6e2a7e0dee0c743b30cf43bbe7f2744fb6890be5"}
+{"doi":"10.4049/jimmunol.180.4.2240","title":"Comprehensive Assessment and Mathematical Modeling of T Cell Population Dynamics and Homeostasis","authors":["V. Thomas-Vaslin","H. K. Altes","R. J. de Boer","D. Klatzmann"],"year":"2008","journal":"The Journal of Immunology","publisher":"The American Association of Immunologists","subject":"Medicine(all)","type":"journal-article","sha":"6fb59f68b41cfa9c747f983dce4f2e195947fece"}
+{"doi":"10.4064/ap87-0-2","title":"Clarke critical values of subanalytic Lipschitz continuous functions","authors":["Jérôme Bolte","Aris Daniilidis","Adrian Lewis","Masahiro Shiota"],"year":"null","journal":"Annales Polonici Mathematici","publisher":"Institute of Mathematics, Polish Academy of Sciences","subject":"","type":"journal-article","sha":"b038f267e7fd92dc2bdb34b31c469530e90e5ab7"}
+{"doi":"10.4159/harvard.9780674062900.c9","title":"Securing Human Rights Intellectually: Philosophical Inquiries about the Universal Declaration","authors":["Mathias Risse"],"year":"null","journal":"The Harvard Sampler","publisher":"Harvard University Press","subject":"","type":"book-chapter","sha":"c75b4bfa53a31ded383019c5fa173de00718206f"}
+{"doi":"10.4161/cc.10.2.14551","title":"KAP1 depletion increases PML nuclear body number in concert with ultrastructural changes in chromatin","authors":["Rosemarie Kepkay","Kathleen M. Attwood","Yael Ziv","Yosef Shiloh","Graham Dellaire"],"year":"2011","journal":"Cell Cycle","publisher":"Informa UK Limited","subject":"Developmental Biology","type":"journal-article","sha":"ba6c9216f35c47eb7e017d9628987fa6f71e8c89"}
+{"doi":"10.4169/000298910x476059","title":"Nemirovski's Inequalities Revisited","authors":[" Lutz Dümbgen"," Sara A. van de Geer"," Mark C. Veraar"," Jon A. Wellner"],"year":"2010","journal":"The American Mathematical Monthly","publisher":"Mathematical Association of America","subject":"Mathematics(all)","type":"journal-article","sha":"2ffbaf317f0d7fa3813721756f82ea7e47555af1"}
+{"doi":"10.4187/respcare.01306","title":"The Future of Exogenous Surfactant Therapy","authors":["D. F. Willson","R. H. Notter"],"year":"2011","journal":"Respiratory Care","publisher":"Daedalus Enterprises","subject":"Critical Care and Intensive Care Medicine","type":"journal-article","sha":"68070afa193b9e0715a47ad538e4feb03aeee560"}
+{"doi":"10.4187/respcare.01709","title":"Evaluation of Interpretation Strategies and Significant Bronchodilator Response in Pediatric Patients With Normal Baseline Spirometry","authors":["Daniel P Hsu","Thad F Ocampo","Heather A DiGiovanni","Eddie Gil"],"year":"2012","journal":"Respiratory Care","publisher":"Daedalus Enterprises","subject":"Critical Care and Intensive Care Medicine","type":"journal-article","sha":"f063aab132accd9a22d75fc46e6f03b85e834c9a"}
+{"doi":"10.4321/s1130-01082011000700009","title":"Gastric splenosis: a rare cause of digestive bleeding","authors":["Bruno Arroja","Nuno Almeida","Charl Rafael Macedo","Ana Paula Moreira","Pedro Oliveira","Luis Tomé"," Hermano Gouveia","Carlos Sofia"],"year":"null","journal":"Revista Española de Enfermedades Digestivas","publisher":"Instituto de Salud Carlos III/BNCS/SciELO Espana","subject":"Gastroenterology","type":"journal-article","sha":"4481baee7f668a0c6524516d788c27fbd7bf4e0d"}
+{"doi":"10.4321/s1134-80462013000300003","title":"Dolor como factor predictor de depresión en el paciente oncológico: estudio de casos y controles. Estudio D-PRESS","authors":["J. Carulla","C. Jara","J. Sanz","C. Martínez","F. Ledesma","E. Zubillaga"],"year":"null","journal":"Revista de la Sociedad Española del Dolor","publisher":"Instituto de Salud Carlos III/BNCS/SciELO Espana","subject":"Anesthesiology and Pain Medicine","type":"journal-article","sha":"2a7b91aec818987571d3758727c1ccd2ddf055f3"}
+{"doi":"10.5012/bkcs.2012.33.10.3423","title":"Dielectric Study of Allyl Chloride with 2-Pentanone and 2-Hexanone in Microwave Frequency Range","authors":["Yuvraj Sudake","Siddharth Kamble","Aruna Maharolkar","Sunil Patil","Prakash Khirade","Suresh Mehrotra"],"year":"2012","journal":"Bulletin of the Korean Chemical Society","publisher":"Korean Chemical Society","subject":"Chemistry(all)","type":"journal-article","sha":"ed5dd319d8aba8b24bd58fe1ea9768c60d6cf7b9"}
+{"doi":"10.5012/jkcs.2010.54.01.038","title":"Analysis of Matrine Alkaloids in Human Urine by Hollow Fiber Liquid-phase Microextraction with High-performance Liquid Chromatography","authors":["Dan-Dan Han","Kyung-Ho Row"],"year":"2010","journal":"Journal of the Korean Chemical Society","publisher":"Korean Chemical Society","subject":"Chemical Engineering (miscellaneous)","type":"journal-article","sha":"2f5b84ff72bf2b1c92d3f3e439d6f5722c6b179c"}
+{"doi":"10.5089/9781451861693.001","title":"Infrastructure Aid, Deindustrialization, and Welfare","authors":["Eun Kwan Choi"," "],"year":"2005","journal":"IMF Working Papers","publisher":"International Monetary Fund","subject":"","type":"journal-article","sha":"e4cbf84f9908178e597455e3202f9ffe7c298ca0"}
+{"doi":"10.5120/11521-7403","title":"Electrifying Roller-Coaster Ride through Speed Breakers","authors":["Amod KumarPandey","Somya Yadav","Tanu Srivastava"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"e865f4f65c20b044e79fcb08c86a124f4c4dc127"}
+{"doi":"10.5120/15282-3880","title":"Privacy Preserving Techniques in Social Networks Data Publishing - A Review","authors":["Amardeep Singh","Divya Bansal","Sanjeev Sofat"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"2daefa6ea16aadf6405a1656af4b088c05c85a3b"}
+{"doi":"10.5120/1529-132","title":"Performance Enhanced Optimization based Image Retrieval System","authors":["Tessy Annie Varghese"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"45c8e3e851fb51c4b8d8741c9d5cda26bec538fb"}
+{"doi":"10.5120/17090-7624","title":"Using Online Hotel Customer Reviews to Improve the Booking Process","authors":["Wojoud Al-Abdullatif","Yasser Kotb"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"7f171be882e37850240a9759ce42172e2a51799a"}
+{"doi":"10.5120/18901-0192","title":"Evaluating the Failures of Data Centers in Cloud Computing","authors":["Preeti Gupta","Chaahat Gupta"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"a23c64ab75e81523f92d417c8d2cab7996d7a3a1"}
+{"doi":"10.5120/7897-1240","title":"Multiple Output Complex Instruction Matching Algorithm for Extensible Processors","authors":["Puneet Goyal","Narayan Chaturvedi"],"year":"null","journal":"International Journal of Computer Applications","publisher":"Foundation of Computer Science","subject":"","type":"journal-article","sha":"09527b517cfbefa52285275f1209368bb32e6016"}
+{"doi":"10.5121/csit.2014.4727","title":"Real-Time Detection of Phishing Tweets","authors":["Nilesh Sharma","Nishant Sharma","Vishakha Tiwari","Shweta Chahar","Smriti Maheshwari"],"year":"2014","journal":"Computer Science & Information Technology ( CS & IT )","publisher":"Academy and Industry Research Collaboration Center (AIRCC)","subject":"","type":"proceedings-article","sha":"bb56465e9b41be35b0bece076ba2a7661450f5d4"}
+{"doi":"10.5194/gmd-4-357-2011","title":"A multi-resolution assessment of the Community Multiscale Air Quality (CMAQ) model v4.7 wet deposition estimates for 2002–2006","authors":["K. W. Appel","K. M. Foley","J. O. Bash","R. W. Pinder","R. L. Dennis","D. J. Allen","K. Pickering"],"year":"null","journal":"Geoscientific Model Development","publisher":"Copernicus GmbH","subject":"","type":"journal-article","sha":"35fc6ebd65003bb8375f8438b39e8babacbb3ae8"}
+{"doi":"10.5296/ijld.v3i3.3861","title":"A Political Economy of the African School as a Learning Organization","authors":["AOK Noah"],"year":"null","journal":"International Journal of Learning and Development","publisher":"Macrothink Institute, Inc.","subject":"","type":"journal-article","sha":"23eaa72bd4543d04262334852b6f287f519438cd"}
+{"doi":"10.5387/fms.52.13","title":"A CASE OF HEPATIC ANGIOSARCOMA SUPPLIED BY BOTH HEPATIC ARTERY AND PORTAL VEIN","authors":["NAMIKO HOSHI","SHINJI MUKAI","MIYUKI OISHI","MAKOTO TAKANO","JOTARO SHINZAWA","SHIGERU WATANABE","SHIGERU YAMAZAKI","HIDEO SAKUMA","HIROMASA OHIRA","KATSUTOSHI OBARA","REIJI KASUKAWA","YUKIO SATO"],"year":"2006","journal":"FUKUSHIMA JOURNAL OF MEDICAL SCIENCE","publisher":"The Fukushima Society of Medical Science","subject":"Medicine(all)","type":"journal-article","sha":"e70a1423e9896c9c290d76334b0f24bf8b283cf5"}
+{"doi":"10.5539/jas.v5n7p167","title":"Essential Oils and Latices as Novel Antiviral Agent Against Potato Leaf Roll Virus and Analysis of Their Phytochemical Constituents Responsible for Antiviral Activity","authors":["Sehrish Iftikhar","Ahmad Ali Shahid","Shabnam Javed","Idrees Ahmad Nasir","Bushra Tabassum","M. Saleem Haider"],"year":"null","journal":"Journal of Agricultural Science","publisher":"Canadian Center of Science and Education","subject":"","type":"journal-article","sha":"d6b0e6d8ffdc2e792eec27198989dc1261016887"}
+{"doi":"10.5772/14899","title":"Biomimetic Fiber-Reinforced Compound Materials","authors":["Tom Masselter","Thomas Speck"],"year":"null","journal":"Advances in Biomimetics","publisher":"InTech","subject":"","type":"book-chapter","sha":"d95e221c6ced102ab185ac0099dfd1cc20d1d486"}
+{"doi":"10.5772/17726","title":"Integral Sliding-Based Robust Control","authors":["Chieh-Chuan Feng"],"year":"null","journal":"Recent Advances in Robust Control - Novel Approaches and Design Methods","publisher":"InTech","subject":"","type":"book-chapter","sha":"53b8aa13cc1854ead919b8019148d8c79443ed52"}
+{"doi":"10.5772/21591","title":"What are Authentic Pharmaceuticals Worth?","authors":["Matthieu Schapranow","Jurgen Muller","Martin Lorenz","Alexander Zeier","Hasso Plattner"],"year":"null","journal":"Designing and Deploying RFID Applications","publisher":"InTech","subject":"","type":"book-chapter","sha":"8fd9e7d8939f13b2c1d2863a80349f2b4d37f5f7"}
+{"doi":"10.5772/5750","title":"Fault-Tolerant Robot Programming through Simulation with Realistic Sensor Models","authors":["Thomas Bräunl","Andreas Koestler","Axel Waggershauser"],"year":"2006","journal":"International Journal of Advanced Robotic Systems","publisher":"SAGE Publications","subject":"","type":"journal-article","sha":"d7862185db4df3b54ed91e897ce498e25235dd8c"}
+{"doi":"10.5944/endoxa.3.1994.4803","title":"Propiedades teleológicas y supervivencia","authors":["Manuel Pérez Otero"],"year":"null","journal":"ENDOXA","publisher":"UNED - Universidad Nacional de Educacion a Distancia","subject":"","type":"journal-article","sha":"ed61f953bdc66b398476ffa5b5538ba3a08c7629"}
+{"doi":"10.5944/rllcgv.vol.17.2012.6030","title":"Novas sobre Ãlvaro Cunqueiro na Galicia de posguerra : Actividades na Asociación Cultural Iberoamericana de A Coruña desde 1951","authors":["Olivia Rodríguez González"],"year":"null","journal":"Revista de lenguas y literaturas catalana, gallega y vasca","publisher":"UNED - Universidad Nacional de Educacion a Distancia","subject":"","type":"journal-article","sha":"06172b6d2cc713cb64f20d5ddb75cb542e101feb"}
+{"doi":"10.5944/signa.vol24.2015.14723","title":"RESEÑA de: Díaz Navarro, Epicteto. En torno a la novela histórica española. Ecos, disidencias y parodias. Madrid: Ediciones del Orto, 2013.","authors":["Rosa Navarro Romero"],"year":"null","journal":"Signa: Revista de la Asociación Española de Semiótica","publisher":"UNED - Universidad Nacional de Educacion a Distancia","subject":"Linguistics and Language","type":"journal-article","sha":"062f90fd3284b2ad89fd7165fe2e12b8ea08ef4c"}
+{"doi":"10.7150/ijms.9027","title":"Laparoscopic Reconstructive Surgery is Superior to Vaginal Reconstruction in the Pelvic Organ Prolapse","authors":["Young-Han Park","Seong Cheon Yang","Sung Taek Park","Sung Ho Park","Hong Bae Kim"],"year":"2014","journal":"International Journal of Medical Sciences","publisher":"Ivyspring International Publisher","subject":"Medicine(all)","type":"journal-article","sha":"0a4f139dde800dc8c45dabbe026f058e56151aa9"}
+{"doi":"10.7155/jgaa.00109","title":"Compact Routing Schemes for Generalised Chordal Graphs","authors":["Yon Dourisboure"],"year":"null","journal":"Journal of Graph Algorithms and Applications","publisher":"Journal of Graph Algorithms and Applications","subject":"Theoretical Computer Science","type":"journal-article","sha":"50bcd025aa92bf20642fdb4e969ce2ed15a63898"}
+{"doi":"10.7591/cornell/9780801452208.003.0004","title":"Meaning, Truth, and Phenomenology","authors":["Mark Bevir"],"year":"2013","journal":"Presence","publisher":"Cornell University Press","subject":"","type":"book-chapter","sha":"ff1d6c24f945e6ff8dabf87d34f03a2d9f008bbb"}
diff --git a/match_test_data/grobid_sample.bibjson b/match_test_data/grobid_sample.bibjson
new file mode 100644
index 0000000..c451642
--- /dev/null
+++ b/match_test_data/grobid_sample.bibjson
@@ -0,0 +1,979 @@
+{"title": "pH influence on oxygen mass transfer coefficient in a bubble column. Individual characterization of kL and a", "sha": "00b30086409bb9c5deb1d92d60b5be641e4688bf", "authors": ["A Ferreira", "P Cardoso", "J Teixeira", "F Rocha"], "doi": "10.1016/j.ces.2013.02.020", "journal": "Chemical Engineering Science"}
+{"title": "Akt/PKB activation and insulin signaling: a novel insulin signaling pathway in the treatment of type 2 diabetes", "sha": "00be67b404fb88a1460bd4a5a2123055214a699d", "authors": ["Richard Mackenzie", "Bradley Elliott"], "doi": "10.2147/dmso.s48260", "journal": "Diabetes, Metabolic Syndrome and Obesity: Targets and Therapy"}
+{"title": "Notas Bibliogr\u00e1ficas", "sha": "00cedef498463d76de3d5909795f5a8a57a7f70b", "authors": [], "doi": null, "journal": null}
+{"title": "Strongly Coupled Artificial Bulk HTS Grain Boundaries With High Critical Current Densities", "sha": "01072d300e511ca28ca45dd52c9b6b80c81d0cbd", "authors": ["N Babu", "T Withnell", "K Iida", "D Cardwell"], "doi": "10.1109/tasc.2007.899063", "journal": "IEEE Transactions on Applied Superconductivity"}
+{"title": "Throughput Performance of Transport-Layer Protocols over Wireless LANs", "sha": "0168e796c1d46e8ac24994653cefe33234d0c3b2", "authors": ["Desiiiioiie 4ntonio", "Choo Mooi", "On-Ching Chuah"], "doi": null, "journal": null}
+{"title": "Semantic dementia: relevance to connectionist models of long-term memory", "sha": "01d676f052bedae39018ff45c3d3b908f3bb8618", "authors": ["Jaap Murre", "Kim Graham", "John Hodges"], "doi": null, "journal": "Brain"}
+{"title": "Effects of Elevated Temperature on Tunable Near-Zero Threshold CMOS \u00dc \u00bf\u00be\u00b9\u00b9\u00b9\u00d8 \u00d1\u00d9\u00d0\u00d8\u00d8\u00d4\u00d0\u00d0\u00d0\u00d6 \u00d3\u00d4\u00d4\u00d6\u00d6\u00d8\u00d8\u00d2\u00d2 \u00d8 \u00bd\u00bc\u00bc AE \u00ba \u00d3\u00d1\u00d4\u00d4\u00d6\u00d6\u00d6 \u00d8\u00d3 \u00d3\u00d4\u00d4\u00d6\u00d6\u00d8\u00d8\u00d3\u00d2 \u00d2\u00d8 \u00be\u00be AE\u00b8\u00d4\u00d4\u00d6\u00d6\u00d3\u00d6\u00d1\u00d1\u00d2\u00d2\u00d2AE\u00b8AE\u00b8\u00d4\u00d4\u00d6\u00d6\u00d3\u00d6\u00d1\u00d1\u00d2\u00d2\u00d2 \u00d2\u00d8 \u00ce \u00be\u00ba\u00bc \u00ce \u00d6\u00d6\u00d6\u00d6\u00d7 \u00bd\u00bd \u00d4\u00d4\u00d6\u00d6\u00d6\u00d2\u00d8 \u00d6\u00d3\u00d1 \u00bd\u00bd\u00bd\u00c5\u00c0\u00de \u00d8\u00d3 \u00bd\u00bd\u00be\u00c5\u00c0\u00de\u00ba \u00d8 \u00d0\u00d3\u00db\u00d6 \u00d7\u00d9\u00d4\u00d4\u00d0\u00dd \u00da\u00d3\u00d0\u00d8\u00d8\u00d8\u00d8\u00d7\u00b8\u00d7\u00da\u00d3\u00d0\u00d8\u00d8\u00d8\u00d8\u00d7\u00b8\u00d7 \u00d7 \u00d9\u00d7\u00d8\u00d8\u00d8 \u00d8\u00d3 \u00d1\u00d1\u00d2\u00d2\u00d1\u00d1\u00de\u00de \u00d4\u00d3\u00db\u00d6 \u00d7\u00d7\u00d7\u00d4\u00d4\u00d8\u00d8\u00d3\u00d2 \u00d2\u00d7 \u00d7 \u00d7\u00d9\u00d2\u00d2\u00d8\u00d8\u00d3\u00d2 \u00d3\u00d3 \u00d3\u00d4\u00d4\u00d6\u00d6\u00d8\u00d8\u00d2\u00d2 \u00d2\u00d6\u00d6\u00d5\u00d9\u00d9\u00d2\u00d2\u00dd \u00d7\u00d7\u00d1\u00d1\u00d0\u00d0\u00d6\u00d0\u00dd \u00d8\u00d3 \u00db\u00db\u00db\u00d8 \u00db \u00d6\u00d6\u00d4\u00d3\u00d6\u00d8\u00d8\u00d8 \u00d0\u00d0\u00d7\u00d8 \u00dd\u00d6 \u00d6\u00d8 \u00be\u00be AE \u00ba \u00ba\u00d3\u00d1\u00d4\u00d4\u00d6\u00d6\u00d2\u00d2 \u00d8\u00d8\u00d8 \u00d3\u00d4\u00d4\u00d6\u00d6\u00d8\u00b9 \u00d2\u00d2 \u00d4\u00d3\u00d3\u00d2\u00d8\u00d7\u00b8\u00d8\u00d8\u00d8\u00d4\u00d3\u00d3\u00d2\u00d8\u00d7\u00b8\u00d8\u00d8\u00d8 \u00d7\u00d7\u00d1\u00d1 \u00d4\u00d4\u00d6\u00d6\u00d3\u00d6\u00d1\u00d1\u00d2\u00d2\u00d2 \u00d2\u00d8 \u00bd\u00bc\u00bc AE 
\u00d6\u00d6\u00d5\u00d9\u00d9\u00d6\u00d6\u00d7 \u00d7\u00d7\u00d3\u00d9\u00d8 \u00bd\u00ba\u00ba \u00d8\u00d8\u00d1\u00d1\u00d7 \u00d8\u00d8\u00d8 \u00d4\u00d3\u00db\u00d6 \u00d1\u00d1\u00d1\u00d7\u00d9\u00d6\u00d6\u00d6 \u00d6\u00d8 \u00be\u00be AE \u00ba \u00c1\u00d8 \u00d8\u00d0\u00d7\u00d3 \u00d6\u00d6\u00d5\u00d9\u00d9\u00d6\u00d6\u00d7 \u00d7\u00d7\u00d3\u00d9\u00d8 \u00bd\u00ba\u00be \u00ce \u00ce\u00ce\u00ce\u00ce\u00d8\u00d8\u00d3\u00d2\u00d2\u00d0 \u00d0\u00d0\u00d0 \u00d7 \u00d7\u00d2\u00d2 \u00d2\u00d2\u00d3\u00d9\u00d8 \u00d8 \u00be\u00bc \u00d4\u00d4\u00d6\u00d6\u00d6\u00d2\u00d8 \u00d8\u00d2\u00d2\u00d6\u00d6\u00d6\u00d7\u00d7 \u00d2 \u00ce \u00ba \u00cc\u00cc\u00cc \u00d6\u00d6\u00d6\u00d8\u00d8\u00d3\u00d2 \u00d3\u00d3 \u00d8\u00d3\u00d8\u00d8\u00d0 \u00d4\u00d3\u00db\u00d6 \u00d7\u00d7\u00d7\u00d4\u00d4\u00d8\u00d8\u00d8 \u00d7 \u00d0\u00d0\u00d0\u00d0 \u00d2\u00d2\u00d6\u00d6\u00d6\u00d7\u00d7\u00d7 \u00d7\u00dd \u00dd\u00dd\u00d3\u00d9\u00d8 \u00bd\u00ba\u00ba \u00d8\u00d8\u00d1\u00d1\u00d7\u00ba", "sha": "01facce80940a30b7eae2e52d483b2849ab9e8b1", "authors": ["Vjekoslav Svilan", "James Burr", "G Tyler"], "doi": null, "journal": null}
+{"title": "The Effect of Scale on the Applicability of Taylor\u2019s Frozen Turbulence Hypothesis in the Atmospheric Boundary Layer", "sha": "0298cfd505d89679b4acde582656896634e5e6e0", "authors": ["Chad Higgins", "Martin Froidevaux", "Valentin Simeonov", "Nikki Vercauteren", "Caitlin Barry", "Marc Parlange"], "doi": "10.1007/s10546-012-9701-1", "journal": "Boundary-Layer Meteorology"}
+{"title": "Interconnect Test Pattern Generation Algorithm For Meeting Device and Global SSO Limits With Safe Initial Vectors", "sha": "02e6fba4eed66f04a92930a9a57fb2687e1a237b", "authors": ["Kendrick Baker"], "doi": null, "journal": null}
+{"title": "Ethnicity and the Immigration of Highly Skilled Workers to the United States", "sha": "03684097bbb71bf17fd708f75d3480c168c99108", "authors": ["Guillermina Jasso"], "doi": null, "journal": null}
+{"title": "Electrically pumped silicon waveguide light sources", "sha": "0468b39bf6361b07c0fb3a02b06751e2b3a73636", "authors": ["Hasitha Jayatilleka", "Arsam Nasrollahy-Shiraz", "Anthony Kenyon"], "doi": null, "journal": null}
+{"title": "Componentwise bounds for nearly completely decomposable Markov chains using stochastic comparison and reordering", "sha": "049d840c4e2563e97bc9a21eafdaee64f49357a5", "authors": ["Nihal Pekergin", "Tu\u011frul Dayar", "Denizhan Alparslan"], "doi": "10.1016/j.ejor.2001.09.001", "journal": "European Journal of Operational Research"}
+{"title": "Size-Dependence of Martensite Transformation Temperature of Yttria-Doped Zirconia and the Distribution of Nucleation Sites", "sha": "04ce43d20e39caffd9e73e844b413e202124f03d", "authors": ["Motozo Hayakawa", "Masanori Tamaki"], "doi": "10.2320/matertrans.MRA2008046]", "journal": null}
+{"title": "Palliative Care Needs of Seriously Ill, Older Adults Presenting to the Emergency Department", "sha": "0516faa504116b73387c35e659d45e521066052b", "authors": ["Corita Grudzen", "Lynne Richardson", "Matthew Morrison", "Elizabeth Cho", "R Sean Morrison"], "doi": "10.1111/j.1553-2712.2010.00907.x", "journal": "Academic Emergency Medicine"}
+{"title": "Modern Social Welfare in the Light of the Sustainability Model", "sha": "0536d30be726c8d489e8aeef87e98b538ceb79d1", "authors": ["Lubov Ivankina", "Tatjana Latygovskaya"], "doi": "10.1016/j.sbspro.2014.12.493", "journal": "Procedia - Social and Behavioral Sciences"}
+{"title": "An eye-tracking methodology for characterizing program comprehension processes", "sha": "055ac7185a7bf5111321c04c0eafc9a46d746d81", "authors": ["Roman Bednarik", "Markku Tukiainen"], "doi": null, "journal": "ETRA"}
+{"title": "HISTORICAL PERSPECTIVE", "sha": "05b2534518cbfb4b2e8a4f5b34231b4d8bae53d5", "authors": [], "doi": null, "journal": null}
+{"title": "Effects of atoms on brittle fracture", "sha": "060745effd7551ea04a085835f4193895be967d2", "authors": ["M Marder"], "doi": null, "journal": "International Journal of Fracture"}
+{"title": null, "sha": "06172b6d2cc713cb64f20d5ddb75cb542e101feb", "authors": [], "doi": null, "journal": null}
+{"title": null, "sha": "062f90fd3284b2ad89fd7165fe2e12b8ea08ef4c", "authors": [], "doi": null, "journal": null}
+{"title": "Parallel Performance Study of Monte Carlo Photon Transport Code on Shared-, Distributed-, and Distributed-Shared-Memory Architectures", "sha": "065036bd75768c0e9f0f31e4527c2ae4287c3eb8", "authors": ["Amitava Majumdar"], "doi": null, "journal": null}
+{"title": "Z-disc Transcriptional Coupling, Sarcomeroptosis and Mechanopoptosis", "sha": "06ae5d02cf7e757c964e12956307dc69d765e5a8", "authors": ["Ralph Kn\u00f6ll", "Byambajav Buyandelger"], "doi": "10.1007/s12013-012-9430-6", "journal": "Cell Biochemistry and Biophysics"}
+{"title": "Navigational Indices and Content Interlinkage On The Fly", "sha": "06c880c7317b58b5cbe38b16034bfd11d4330277", "authors": ["Peter Ziewer"], "doi": null, "journal": null}
+{"title": "ADIPOSE DERIVED MESENCHYMAL STEM CELLS ENHANCE CARDIAC FUNCTION AFTER MYOCARDIAC INFARCTION VIA PARACRINE EFFECT", "sha": "0734aaa781354de047c576a2ecdeb9e21c0f8238", "authors": ["Wei Wang", "Chunyu Zeng"], "doi": "10.1136/heartjnl-2012-302920a.71", "journal": "Heart"}
+{"title": null, "sha": "0800f70e58a458fc660baf733aa1dcbb5f516e24", "authors": [], "doi": null, "journal": null}
+{"title": "Targeting Human C-Type Lectin-like Molecule-1 (CLL1) with a Bispecific Antibody for Immunotherapy of Acute Myeloid Leukemia", "sha": "081c73dc2c39f5ca85584ee74a3904c770f1ab7d", "authors": ["Hua Lu", "Quan Zhou", "Vishal Deshmukh", "Hardeep Phull", "Jennifer Ma", "Virginie Tardif", "Rahul Naik", "Claire Bouvard", "Yong Zhang", "Seihyun Choi", "Brian Lawson", "Shoutian Zhu", "Chan Kim", "Peter Schultz"], "doi": "10.1002/anie.201405353", "journal": "Angewandte Chemie International Edition"}
+{"title": null, "sha": "0877595e8255463b313b5977fbe27188765ad123", "authors": [], "doi": null, "journal": null}
+{"title": "The environment and directed technical change", "sha": "08ab85041fe4023ff5fbbec7bbdc48c8d9ca4b11", "authors": ["Citation Acemoglu", "Philippe Daron", "Leonardo Aghion", "David Bursztyn", "Hemous"], "doi": null, "journal": null}
+{"title": "Adaptive wavelet network for multiple cardiac arrhythmias recognition", "sha": "08c914766160ce311856a73c1648aea3bb9e2ff0", "authors": ["C Lin", "Y Du", "T Chen"], "doi": "10.1016/j.eswa.2007.05.008", "journal": "Expert Systems with Applications"}
+{"title": "Multiple Output Complex Instruction Matching Algorithm for Extensible Processors General Terms Matching algorithms, Instruction Set Architecture", "sha": "09527b517cfbefa52285275f1209368bb32e6016", "authors": ["Puneet Goyal", "Narayan Chaturvedi"], "doi": null, "journal": "International Journal of Computer Applications"}
+{"title": "Semi-blind Block Channel Estimation and Signal Detection Using Hidden Markov Models", "sha": "0a0d7a95553a1c762383c896567c5021ed705891", "authors": ["Pei Chen", "Hisashi Kobayashi"], "doi": null, "journal": null}
+{"title": "Laparoscopic Reconstructive Surgery is Superior to Vaginal Reconstruction in the Pelvic Organ Prolapse", "sha": "0a4f139dde800dc8c45dabbe026f058e56151aa9", "authors": ["Young-Han Park", "Seong Yang", "Sung Park", "Sung Park", "Hong Kim"], "doi": "10.7150/ijms.9027", "journal": "International Journal of Medical Sciences"}
+{"title": "Aspects of kaolinite characterization and retention of Pb and Cd", "sha": "0a6790d14853df562568cd6ceaa17689cf08a55d", "authors": ["Cynthia Coles", "Raymond Yong"], "doi": null, "journal": null}
+{"title": "Comparative Study of Protein Immobilization Properties on Calixarene Monolayers", "sha": "0a7c995028ae1b4188ad3ec0a2669b8712d5cf0c", "authors": ["Hongxia Chen", "Minsu Lee", "Sungwook Choi", "Jae-Ho Kim", "Heung-Jin Choi", "Sung-Hoon Kim", "Jeabeom Lee", "Kwangnak Koh"], "doi": null, "journal": "Sensors"}
+{"title": "Composting Unamended Chicken Manure", "sha": "0b182c89296be22625f3669d89e43b9aba709534", "authors": ["D Elwell", "H Keener", "D Carey", "P Schlak"], "doi": null, "journal": "Compost Science & Utilization"}
+{"title": null, "sha": "0b1c525f6caaffff9e8010ec4ba276e460a017e7", "authors": [], "doi": null, "journal": null}
+{"title": "NEW RESEARCH Pathways to Suicide-Related Behavior in Offspring of Mothers With Depression: The Role of Offspring Psychopathology", "sha": "0bd5b0eb5e91e9721cbca3dd64673e00acc6387a", "authors": ["Gemma Hammerton", "Stanley Zammit", "Liam Mahedy", "Rebecca Pearson", "Ruth Sellers", "Anita Thapar", "Stephan Collishaw"], "doi": null, "journal": "JOURNAL OF THE AMERICAN ACADEMY OF CHILD & ADOLESCENT PSYCHIATRY"}
+{"title": "THE EXISTENCE OF MINIMAL REGULAR LOCAL OVERRINGS FOR AN ARBITRARY DOMAIN", "sha": "0be36d55c0cff44681c3b3bd49c339867b00aa92", "authors": ["Bernard Johnston"], "doi": null, "journal": null}
+{"title": "Crustal melting, ductile flow, and deformation in mountain belts: Cause and effect relationships", "sha": "0bffc30eacbe575a9231e507b6e4bd33a3040f69", "authors": ["M Searle"], "doi": "10.1130/rf.l006.1", "journal": "Lithosphere"}
+{"title": "Performance and Reliability of Strained-Silicon nMOSFETs With SiN Cap Layer", "sha": "0cdec1bf14c99dd1f782c382150f4659f1524922", "authors": ["Gino Giusi", "Felice Crupi", "Eddy Simoen", "Geert Eneman", "Malgorzata Jurczak"], "doi": "10.1109/ted.2006.887198", "journal": "IEEE Transactions on Electron Devices"}
+{"title": "Automatic Proof of Graph Nonisomorphism", "sha": "0ce718957cbc4cd5b59a8e9c53104d860775f9db", "authors": ["Arjeh Cohen", "Jan Knopper", "Scott Murray"], "doi": "10.1007/s11786-008-0052-8", "journal": "Mathematics in Computer Science"}
+{"title": "A Simple Theory of International Trade with Multinational Corporations", "sha": "0d639942cba43fcda79375eb570b0b0169875b45", "authors": ["Elhanan Helpman"], "doi": "10.1086/261236", "journal": "Journal of Political Economy"}
+{"title": "Blocking the APRIL circuit enhances acute myeloid leukemia cell chemosensitivity", "sha": "0d9a6c6c9501934e791339e54eef1c1ddc856ccd", "authors": ["D Bonci", "M Musumeci", "V Coppola", "A Addario", "C Conticello", "M Hahne", "M Gulisano", "F Grignani", "R De Maria"], "doi": "10.3324/haematol.13035", "journal": "Haematologica"}
+{"title": "Local statistics in natural scenes predict the saliency of synthetic textures", "sha": "0ddeb395084fa286f0e98dc58e1403a6e6b5cb0e", "authors": ["Gasper Tkacik", "Gasper Tkacik", "Jason Prentice", "Jonathan Victor", "Vijay Balasubramanian"], "doi": "10.1038/npre.2011.6009.1", "journal": "Nature Precedings"}
+{"title": "A new proof system to verify GDT agents", "sha": "0e2f639cfe1e502c076a43cabcb34fa8036a282d", "authors": ["Bruno Mermet", "Ga\u00eble Simon"], "doi": "10.1007/978-3-319-01571-", "journal": "Studies in Computational Intelligence"}
+{"title": "Comparative study of silicon and germanium sputtering by 1\u201320keV Ar ions", "sha": "0e580a14e701dbb8fd49afcd1683dfa91f66529a", "authors": ["V Shulga"], "doi": "10.1016/j.nimb.2006.11.068", "journal": "Nuclear Instruments and Methods in Physics Research Section B: Beam Interactions with Materials and Atoms"}
+{"title": "Atomistic Theory of Ostwald Ripening and Disintegration of Supported Metal Particles under Reaction Conditions", "sha": "0e668009d3be867dc097d3fd85eb34689f2d3be9", "authors": ["Runhai Ouyang", "Jin-Xun Liu", "Wei-Xue Li"], "doi": "10.1021/ja3087054", "journal": "Journal of the American Chemical Society"}
+{"title": "Prediction of faults-slip-through in large software projects: an empirical evaluation", "sha": "0e66f3fe27bde5f22e73eb7c58b71796ad57069a", "authors": ["Wasif Afzal", "Richard Torkar", "Robert Feldt", "Tony Gorschek"], "doi": "10.1007/s11219-013-9205-3", "journal": "Software Quality Journal"}
+{"title": "Oncogenesis of T-ALL and nonmalignant consequences of overexpressing intracellular NOTCH1", "sha": "0e807dca2f78ddda1e6eb0969764ce275c769e0b", "authors": ["Xiaoyu Li", "Fotini Gounari", "Alexei Protopopov", "Khashayarsha Khazaie", "Harald Von Boehmer"], "doi": "10.1084/jem.20081561", "journal": "The Journal of Experimental Medicine"}
+{"title": "Having a word with yourself: Neural correlates of self-criticism and self-reassurance", "sha": "0e84ae34f2942d66accb38777fffbdafdfcd5511", "authors": ["Olivia Longe", "Frances Maratos", "Paul Gilbert", "Gaynor Evans", "Faye Volker", "Helen Rockliff", "Gina Rippon"], "doi": "10.1016/j.neuroimage.2009.09.019", "journal": "NeuroImage"}
+{"title": "Validation of the Warwick\u2013Edinburgh Mental Well-being Scale (WEMWBS) as an overall indicator of population mental health and well-being in the UK veterinary profession", "sha": "0e935f625db1623c4fd32111a6048e61ee71d816", "authors": ["David Bartram", "Ghasem Yadegarfar", "Julia Sinclair", "David Baldwin"], "doi": "10.1016/j.tvjl.2010.02.010", "journal": "The Veterinary Journal"}
+{"title": "Li-bearing tourmalines in Variscan granitic pegmatites from the Moldanubian nappes, Lower Austria", "sha": "0ea9a3afc9e2507ef2eba02a6efe4a69ec9bf03a", "authors": ["Andreas Ertl", "Ralf Schuster", "John Hughes", "Thomas Ludwig", "Hans-Peter Meyer", "Friedrich Finger", "M Dyar", "Katja Ruschel", "George Rossman", "Urs Kl\u00f6tzli", "Franz Brandst\u00e4tter", "Christian Lengauer", "Ekkehart Tillmanns"], "doi": "10.1127/0935-1221/2012/0024-2203", "journal": "European Journal of Mineralogy"}
+{"title": "Gradient Learning in Spiking Neural Networks by Dynamic Perturbation of Conductances", "sha": "0f6089fb276a8ab926b735b9043263362bf19985", "authors": ["Ila Fiete", "H Seung"], "doi": "10.1103/physrevlett.97.048104", "journal": "Physical Review Letters"}
+{"title": "The correlation of emphysema or airway obstruction with the risk of lung cancer: a matched case-controlled study", "sha": "0f6f47dbeb6bef25f0281c1d8d04d2dbccc507c2", "authors": ["K Kishi", "J Gurney", "D Schroeder", "P Scanlon", "S Swensen", "J Jett"], "doi": "10.1183/09031936.02.00264202", "journal": "European Respiratory Journal"}
+{"title": "Project procurement and disposal decisions: An inventory management model", "sha": "0f810b97278361d3719d9e534e20ad9e9b22ea4f", "authors": ["Keith Willoughby"], "doi": null, "journal": null}
+{"title": "Numerical simulation of Lamb wave propagation in metallic foam sandwich structures: a parametric study", "sha": "0fb9409afe8154563f8530d8e4f330e49a9fbe53", "authors": ["Seyed Hosseini", "Abdolreza Kharaghani", "Christoph Kirsch", "Ulrich Gabbert"], "doi": "10.1016/j.compstruct.2012.10.039", "journal": "Composite Structures"}
+{"title": "(Anti-) Coordination in Networks", "sha": "0fbb9c74cdb89dae0a88033d54d96b7fc31fe123", "authors": ["Jaromir Kovarik", "Friederike Mengel", "Jos\u00e9 Romero"], "doi": null, "journal": null}
+{"title": "Synthesis and Antileishmanial Activity of New 1-Aryl-1H-Pyrazole-4-Carboximidamides Derivatives", "sha": "0fd34746fa5b0fe2fbd05e8542723ef2cae01cf9", "authors": ["Maur\u00edcio Dos Santos", "Adriana Gomes", "Alice Bernardino", "Marcos De Souza", "Misbahul Khan", "Monique De Brito", "Helena Castro", "Paula Abreu", "Carlos Rodrigues"], "doi": null, "journal": "J. Braz. Chem. Soc"}
+{"title": "Shape discrimination and transfer in the California sea lion)", "sha": "1023569cb1f068df6f70b547b2528bbfaf187649", "authors": [], "doi": null, "journal": null}
+{"title": "XCS with Computed Prediction in Continuous Multistep Environments", "sha": "10e7f01df21f05034b4871f1dcf8bb9d83408864", "authors": ["Pier Lanzi", "Daniele Loiacono", "Stewart Wilson", "David Goldberg"], "doi": null, "journal": null}
+{"title": null, "sha": "113edd22f6e661bbdd3ca07752db9f086460d975", "authors": [], "doi": null, "journal": null}
+{"title": "Managing World-Word-Web Publications", "sha": "1155438832880ac7d7863f1438b758fb7f4d29d7", "authors": ["S Foo", "E Lim"], "doi": null, "journal": "Information Management & Computer Security"}
+{"title": "Self-Organization Properties of CSMA/CA Systems and Their Consequences on Fairness", "sha": "1198ffbc0ff68352ba52f37623f312f745562822", "authors": ["Mathilde Durvy", "Olivier Dousse", "Patrick Thiran"], "doi": "10.1109/tit.2008.2011427", "journal": "IEEE Transactions on Information Theory"}
+{"title": "CENTRO STUDI LUCA D'AGLIANO DEVELOPMENT STUDIES WORKING PAPERS Remittances and Financial Openness", "sha": "11c2758cb463ffc56c21640f97c93428f00f2155", "authors": ["Michel Beine", "Elisabetta Lodigiani", "Robert Vermeulen"], "doi": null, "journal": null}
+{"title": "Trading Volume and Cross-Autocorrelations in Stock Returns", "sha": "1226761434526332c98927c364c78d3d452b0624", "authors": ["Tarun Chordia", "Bhaskaran Swaminathan", "Clifford Ball", "Doug Foster", "Roger Huang", "Charles Lee", "Craig Lewis", "Ron Masulis", "Matt Spiegel", "Hans Stoll"], "doi": null, "journal": "THE JOURNAL OF FINANCE \u2022"}
+{"title": "Rigorously modeling self-stabilizing fault-tolerant circuits: An ultra-robust clocking scheme for systems-on-chip", "sha": "124eeff9e2c3d7ea060c45a86248b1ae265ec117", "authors": ["Danny Dolev", "Matthias F\u00fcgger", "Markus Posch", "Ulrich Schmid", "Andreas Steininger", "Christoph Lenzen"], "doi": "10.1016/j.jcss.2014.01.001", "journal": "Journal of Computer and System Sciences"}
+{"title": "Resili\u00eancia e Desenvolvimento Infantil de Crian\u00e7as que Cuidam de Crian\u00e7as: Uma Vis\u00e3o em Perspectiva Resilience and Child Development of Children who Take Care of Children: A Vision in Perspective", "sha": "124fbaf6ebb5ca85aae5f58500fa398f57a158fb", "authors": ["Michele Poletto", "T\u00e2nia Maria", "Cemin Wagner"], "doi": null, "journal": "Psicologia: Teoria e Pesquisa Set-Dez"}
+{"title": "Popper e o espelho de Darwin", "sha": "12666d9e8524d9b06e5385ad4c3a23a4ddc8671d", "authors": ["Darwin Popper", "Maur\u00edcio De", "Carvalho Ramos"], "doi": null, "journal": null}
+{"title": "Barrett esophagus: epidemiology, pathogenesis, diagnosis, and management", "sha": "12ca06aba49eaed25a78515f59b8e61f231091b5", "authors": ["David Estores", "Vic Velanovich"], "doi": "10.1067/j.cpsurg.2013.01.004", "journal": "Current Problems in Surgery"}
+{"title": "Characterization and removal of extra lattice species in faujasites", "sha": "12d47e1c00070ddd4a00fe3e41b149d986368859", "authors": ["Michael Stockenhuber", "J Lercher"], "doi": null, "journal": null}
+{"title": "SELECTIVE W FOR COATING AND RELEASING MEMS DEVICES", "sha": "133b9438d862e462e5fa15993734e2359790ad25", "authors": ["S Mani", "J Fleming", "J Sniegowsu", "M De Boer", "L Irwin", "J Walraven", "D Tanner", "D La"], "doi": null, "journal": null}
+{"title": "The animal model of human amnesia: Long-term memory impaired and short-term memory intact", "sha": "13f0d3ec6f3a280faa269b32e801921d234ed5ea", "authors": ["Pablo Alvarez", "Stuart Zola-Morgan", "Larry Squire"], "doi": null, "journal": "Proc. Nati. Acad. Sci. USA"}
+{"title": "OBSERVATION OF FATIGUE CRACK INITIATION AND EARLY PROPAGATION IN ULTRAFINE-GRAINED STEEL BY ATOMIC FORCE MICROSCOPY", "sha": "1422e6a269d7afea49683280b335a7352789c2dc", "authors": ["H Kimura", "Y Akiniwa", "K Tanaka", "J Kondo", "T Ishikawa"], "doi": null, "journal": null}
+{"title": "Number of Spikes in Climbing Fibers Determines the Direction of Cerebellar Learning", "sha": "15cedc19e97c839a3b6ce923a3b88ac6a6f5352d", "authors": ["A Rasmussen", "D-A Jirenhed", "R Zucca", "F Johansson", "P Svensson", "G Hesslow"], "doi": "10.1523/jneurosci.1527-13.2013", "journal": "Journal of Neuroscience"}
+{"title": "VARIATIONAL METHODS FOR NONLINEAR ELLIPTIC EIGENVALUE PROBLEMS", "sha": "15e36c90d92766af6bf34e705baa8682a42b47ab", "authors": ["Felix Browder"], "doi": null, "journal": null}
+{"title": "Endothelial nitric oxide synthase deficiency results in reduced chondrocyte proliferation and endochondral bone growth", "sha": "161d5d3d9e13662eb81bff67fd9442e2c8e0c999", "authors": ["Qian Yan", "Qingping Feng", "Frank Beier"], "doi": "10.1002/art.27486", "journal": "Arthritis & Rheumatism"}
+{"title": "Right bundle branch block during transvenous ventricular pacing", "sha": "166541f1ad1a8fc06290625e7f1c1fb68a86a6d4", "authors": ["William Abernathy", "J Barry", "Crevey", "Ann"], "doi": null, "journal": null}
+{"title": "Multi-factor Gegenbauer Processes and European Inflation Rates", "sha": "167c256dba61a9800ca49786c532438d12e2e2fa", "authors": ["Guglielmo Caporale", "Luis Gil-Alana"], "doi": null, "journal": null}
+{"title": "DS-LFSR: A BIST TPG for Low Switching Activity", "sha": "168b3167d58cbc2a9311d36a2013cac643990819", "authors": ["Seongmoon Wang", "Sandeep Gupta"], "doi": null, "journal": "IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS"}
+{"title": "Single units and sensation: A neuron doctrine for perceptual psychology?", "sha": "16a4b9d3bdd0fdd9703bf787f6c17ca7aed55c3b", "authors": ["H Barlow"], "doi": null, "journal": "Perception"}
+{"title": "Thermal plume models and melt generation in East Africa: A dynamic modeling approach", "sha": "16dd01f9b58f8ddc90ab7dff82e4e3d86532406f", "authors": ["Shu-Chuan Lin", "Ban-Yuan Kuo", "Ling-Yun Chiao", "Peter Van Keken"], "doi": "10.1016/j.epsl.2005.04.049", "journal": "Earth and Planetary Science Letters"}
+{"title": "Galilean Differential Geometry of Moving Images", "sha": "16fa31f3ff56bd670dae3e046f5f60ddc47b5cac", "authors": ["Daniel Fagerstr\u00f6m"], "doi": null, "journal": null}
+{"title": "Preproghrelin polymorphism Q90L (rs4684677) in gestational diabetes Polimorfismo Q90L (rs4684677) da preprogrelina no diabetes gestacional", "sha": "17043b885f1981ce79cb3dcb8d2fd5004c9c14b8", "authors": ["Rafaela Rocha", "Henrique Frigeri", "Izabella Castilhos", "Ribeiro Dos Santos-Weiss", "Ros\u00e2ngela R\u00e9a", "Emanuel Maltempi De Souza", "Fabiane Gomes", "Moraes Rego", "Geraldo Picheth"], "doi": null, "journal": null}
+{"title": "The lateral parabrachial nucleus is actively involved in the acquisition of fear memory in mice", "sha": "1711f9d40f38bf887674e16752c66d62b5909ee9", "authors": ["Masaru Sato", "Mariko Ito", "Masashi Nagase", "Yae Sugimura", "Yukari Takahashi", "Ayako Watabe", "Fusao Kato"], "doi": "10.1186/s13041-015-0108-z", "journal": "Molecular Brain"}
+{"title": "Hi-Stat Discussion Paper Research Unit for Statistical and Empirical Analysis in Social Sciences (Hi-Stat) Nonparametric Regression for Dependent Data in the Errors-in-Variables Problem", "sha": "1843b161d067ace3318cf167f2b550ca1e88aa6e", "authors": ["Hi Stat", "Toshio Honda"], "doi": null, "journal": null}
+{"title": "On the applicability of available bandwidth estimation techniques and tools", "sha": "1861f48021815c373bd81c64fbba026a2e98c5c5", "authors": ["Cesar Guerrero", "Miguel Labrador"], "doi": "10.1016/j.comcom.2009.08.010", "journal": "Computer Communications"}
+{"title": "Sequential Design in Quality Control and Validation of Land Cover Data Bases", "sha": "1894b4c446e8471ee23fd783be527c6bb387268f", "authors": ["E Carfagna", "J Marzialetti"], "doi": null, "journal": null}
+{"title": "Recognizing Macroeconomic Fluctuations in Value Based Management", "sha": "18b9ff2814b0a0c9d79efa57b171c66dc85870c0", "authors": ["Lars Oxelheim", "Clas Wihlborg"], "doi": null, "journal": null}
+{"title": "Dinuclear pincer-palladium(II) complexes and their use as homogeneous or heterogeneous catalyst for the aldol reaction of methyl isocyanoacetate", "sha": "18c654e234595de3939f758db3ec1714dc848cbb", "authors": ["Raquel Gim\u00e9nez", "Timothy Swager"], "doi": null, "journal": "Journal of Molecular Catalysis A: Chemical"}
+{"title": "Brain activations associated with sign production using word and picture inputs in deaf signers", "sha": "18e94c8f5ff6672737ebb18b4a2a5dc11ee23938", "authors": ["Zhiguo Hu", "Wenjing Wang", "Hongyan Liu", "Danling Peng", "Yanhui Yang", "Kuncheng Li", "John Zhang", "Guosheng Ding"], "doi": "10.1016/j.bandl.2010.11.006", "journal": "Brain and Language"}
+{"title": "A WEIGHT-ADAPTIVE DYNAMIC MODEL FOR SHAPE SEGMENTATION", "sha": "190ced1171d057dc936099d0570276900d7192cf", "authors": ["Klaus Toennies", "Peter Benedix"], "doi": null, "journal": null}
+{"title": "Emotion-prints: interaction-driven emotion visualization on multi-touch interfaces", "sha": "1947f21d1f79168028210b9039832fde6987d963", "authors": ["Daniel Cernea", "Christopher Weber", "Achim Ebert", "Andreas Kerren"], "doi": "10.1117/12.2076473", "journal": "Visualization and Data Analysis 2015"}
+{"title": "Kudzu (Pueraria montana) invasion doubles emissions of nitric oxide and increases ozone pollution", "sha": "197de024dfa683ba4e91462bf7ac0e6bf2b1e5af", "authors": ["J Hickman", "S Wu", "L Mickley", "M Lerdau"], "doi": "10.1073/pnas.0912279107", "journal": "Proceedings of the National Academy of Sciences"}
+{"title": "ATLAS search for a heavy gauge boson decaying to a charged lepton and a neutrino in pp collisions at $\\sqrt{s} = 7\\ \\mathrm{TeV}$", "sha": "19caae86acd7e35fd26a1c78de9ca404f989ad8d", "authors": ["The Atlas Collaboration", "G Aad", "T Abajyan", "B Abbott", "J Abdallah", "S Abdel Khalek", "A Abdelalim", "O Abdinov", "R Aben", "B Abi", "M Abolins", "O Abouzeid", "H Abramowicz", "H Abreu", "B Acharya", "L Adamczyk", "D Adams", "T Addy", "J Adelman", "S Adomeit", "P Adragna", "T Adye", "S Aefsky", "J Aguilar-Saavedra", "M Agustoni", "M Aharrouche", "S Ahlen", "F Ahles", "A Ahmad", "M Ahsan", "G Aielli", "T Akdogan", "T \u00c5kesson", "G Akimoto", "A Akimov", "M Alam", "M Alam", "J Albert", "S Albrand", "M Aleksa", "I Aleksandrov", "F Alessandria", "C Alexa", "G Alexander", "G Alexandre", "T Alexopoulos", "M Alhroob", "M Aliev", "G Alimonti", "J Alison", "B Allbrooke", "P Allport", "S Allwood-Spiers", "J Almond", "A Aloisio", "R Alon", "A Alonso", "F Alonso", "A Altheimer", "B Alvarez Gonzalez", "M Alviggi", "K Amako", "C Amelung", "V Ammosov", "S Amor\u00a0dos\u00a0santos", "A Amorim", "N Amram", "C Anastopoulos", "L Ancu", "N Andari", "T Andeen", "C Anders", "G Anders", "K Anderson", "A Andreazza", "V Andrei", "M-L Andrieux", "X Anduaga", "P Anger", "A Angerami", "F Anghinolfi", "A Anisenkov", "N Anjos", "A Annovi", "A Antonaki", "M Antonelli", "A Antonov", "J Antos", "F Anulli", "M Aoki", "S Aoun", "L Aperio Bella", "R Apolle", "G Arabidze", "I Aracena", "Y Arai", "A Arce", "S Arfaoui", "J-F Arguin", "E Arik", "M Arik", "A Armbruster", "O Arnaez", "V Arnal", "C Arnault", "A Artamonov", "G Artoni", "D Arutinov", "S Asai", "R Asfandiyarov", "S Ask", "B \u00c5sman", "L Asquith", "K Assamagan", "A Astbury", "M Atkinson", "B Aubert", "E Auge", "K Augsten", "M Aurousseau", "G Avolio", "R Avramidou", "D Axen", "G Azuelos", "Y Azuma", "M Baak", "G Baccaglioni", "C Bacci", "A Bach", "H Bachacou", "K Bachas", "M Backes", "M Backhaus", "E Badescu", "P Bagnaia", "S 
Bahinipati", "Y Bai", "D Bailey", "T Bain", "J Baines", "O Baker", "M Baker", "S Baker", "E Banas", "P Banerjee", "S Banerjee", "D Banfi", "A Bangert", "V Bansal", "H Bansil", "L Barak", "S Baranov", "A Barbaro Galtieri", "T Barber", "E Barberio", "D Barberis", "M Barbero", "D Bardin", "T Barillari", "M Barisonzi", "T Barklow", "N Barlow", "B Barnett", "R Barnett", "A Baroncelli", "G Barone", "A Barr", "F Barreiro", "J Barreiro Guimar\u00e3es\u00a0da\u00a0costa", "P Barrillon", "R Bartoldus", "A Barton", "V Bartsch", "A Basye", "R Bates", "L Batkova", "J Batley", "A Battaglia", "M Battistin", "F Bauer", "H Bawa", "S Beale", "T Beau", "P Beauchemin", "R Beccherle", "P Bechtle", "H Beck", "A Becker", "S Becker", "M Beckingham", "K Becks", "A Beddall", "A Beddall", "S Bedikian", "V Bednyakov", "C Bee", "L Beemster", "M Begel", "S Behar Harpaz", "P Behera", "M Beimforde", "C Belanger-Champagne", "P Bell", "W Bell", "G Bella", "L Bellagamba", "F Bellina", "M Bellomo", "A Belloni", "O Beloborodova", "K Belotskiy", "O Beltramello", "O Benary", "D Benchekroun", "K Bendtz", "N Benekos", "Y Benhammou", "E Benhar Noccioli", "J Benitez Garcia", "D Benjamin", "M Benoit", "J Bensinger", "K Benslama", "S Bentvelsen", "D Berge", "E Bergeaas Kuutmann", "N Berger", "F Berghaus", "E Berglund", "J Beringer", "P Bernat", "R Bernhard", "C Bernius", "T Berry", "C Bertella", "A Bertin", "F Bertolucci", "M Besana", "G Besjes", "N Besson", "S Bethke", "W Bhimji", "R Bianchi", "M Bianco", "O Biebel", "S Bieniek", "K Bierwagen", "J Biesiada", "M Biglietti", "H Bilokon", "M Bindi", "S Binet", "A Bingul", "C Bini", "C Biscarat", "B Bittner", "K Black", "R Blair", "J-B Blanchard", "G Blanchot", "T Blazek", "I Bloch", "C Blocker", "J Blocki", "A Blondel", "W Blum", "U Blumenschein", "G Bobbink", "V Bobrovnikov", "S Bocchetta", "A Bocci", "C Boddy", "M Boehler", "J Boek", "N Boelaert", "J Bogaerts", "A Bogdanchikov", "A Bogouch", "C Bohm", "J Bohm", "V Boisvert", "T Bold", "V Boldea", "N Bolnet", 
"M Bomben", "M Bona", "M Boonekamp", "S Bordoni", "C Borer", "A Borisov", "G Borissov", "I Borjanovic", "M Borri", "S Borroni", "V Bortolotto", "K Bos", "D Boscherini", "M Bosman", "H Boterenbrood", "J Bouchami", "J Boudreau", "E Bouhova-Thacker", "D Boumediene", "C Bourdarios", "N Bousson", "A Boveia", "J Boyd", "I Boyko", "I Bozovic-Jelisavcic", "J Bracinik", "P Branchini", "G Brandenburg", "A Brandt", "G Brandt", "O Brandt", "U Bratzler", "B Brau", "J Brau", "H Braun", "S Brazzale", "B Brelier", "J Bremer", "K Brendlinger", "R Brenner", "S Bressler", "D Britton", "F Brochu", "I Brock", "R Brock", "F Broggi", "C Bromberg", "J Bronner", "G Brooijmans", "T Brooks", "W Brooks", "G Brown", "H Brown", "P Bruckman\u00a0de\u00a0renstrom", "D Bruncko", "R Bruneliere", "S Brunet", "A Bruni", "G Bruni", "M Bruschi", "T Buanes", "Q Buat", "F Bucci", "J Buchanan", "P Buchholz", "R Buckingham", "A Buckley", "S Buda", "I Budagov", "B Budick", "V B\u00fcscher", "L Bugge", "M Bugge", "O Bulekov", "A Bundock", "M Bunse", "T Buran", "H Burckhart", "S Burdin", "T Burgess", "S Burke", "E Busato", "P Bussey", "C Buszello", "B Butler", "J Butler", "C Buttar", "J Butterworth", "W Buttinger", "S Cabrera Urb\u00e1n", "D Caforio", "O Cakir", "P Calafiura", "G Calderini", "P Calfayan", "R Calkins", "L Caloba", "R Caloi", "D Calvet", "S Calvet", "R Camacho Toro", "P Camarri", "D Cameron", "L Caminada", "R Caminal Armadans", "S Campana", "M Campanelli", "V Canale", "F Canelli", "A Canepa", "J Cantero", "R Cantrill", "L Capasso", "M Capeans Garrido", "I Caprini", "M Caprini", "D Capriotti", "M Capua", "R Caputo", "R Cardarelli", "T Carli", "G Carlino", "L Carminati", "B Caron", "S Caron", "E Carquin", "G Carrillo Montoya", "A Carter", "J Carter", "J Carvalho", "D Casadei", "M Casado", "M Cascella", "C Caso", "A Castaneda Hernandez", "E Castaneda-Miranda", "V Castillo Gimenez", "N Castro", "G Cataldi", "P Catastini", "A Catinaccio", "J Catmore", "A Cattai", "G Cattani", "S Caughron", "V 
Cavaliere", "P Cavalleri", "D Cavalli", "M Cavalli-Sforza", "V Cavasinni", "F Ceradini", "A Cerqueira", "A Cerri", "L Cerrito", "F Cerutti", "S Cetin", "A Chafaq", "D Chakraborty", "I Chalupkova", "K Chan", "P Chang", "B Chapleau", "J Chapman", "J Chapman", "E Chareyre", "D Charlton", "V Chavda", "C Chavez Barajas", "S Cheatham", "S Chekanov", "S Chekulaev", "G Chelkov", "M Chelstowska", "C Chen", "H Chen", "S Chen", "X Chen", "Y Chen", "A Cheplakov", "R Cherkaoui\u00a0el\u00a0moursli", "V Chernyatin", "E Cheu", "S Cheung", "L Chevalier", "G Chiefari", "L Chikovani", "J Childers", "A Chilingarov", "G Chiodini", "A Chisholm", "R Chislett", "A Chitan", "M Chizhov", "G Choudalakis", "S Chouridou", "I Christidi", "A Christov", "D Chromek-Burckhart", "M Chu", "J Chudoba", "G Ciapetti", "A Ciftci", "R Ciftci", "D Cinca", "V Cindro", "C Ciocca", "A Ciocio", "M Cirilli", "P Cirkovic", "Z Citron", "M Citterio", "M Ciubancan", "A Clark", "P Clark", "R Clarke", "W Cleland", "J Clemens", "B Clement", "C Clement", "Y Coadou", "M Cobal", "A Coccaro", "J Cochran", "L Coffey", "J Cogan", "J Coggeshall", "E Cogneras", "J Colas", "S Cole", "A Colijn", "N Collins", "C Collins-Tooth", "J Collot", "T Colombo", "G Colon", "P Conde Mui\u00f1o", "E Coniavitis", "M Conidi", "S Consonni", "V Consorti", "S Constantinescu", "C Conta", "G Conti", "F Conventi", "M Cooke", "B Cooper", "A Cooper-Sarkar", "K Copic", "T Cornelissen", "M Corradi", "F Corriveau", "A Cortes-Gonzalez", "G Cortiana", "G Costa", "M Costa", "D Costanzo", "D C\u00f4t\u00e9", "L Courneyea", "G Cowan", "C Cowden", "B Cox", "K Cranmer", "F Crescioli", "M Cristinziani", "G Crosetti", "S Cr\u00e9p\u00e9-Renaudin", "C-M Cuciuc", "C Cuenca Almenar", "T Cuhadar Donszelmann", "M Curatolo", "C Curtis", "C Cuthbert", "P Cwetanski", "H Czirr", "P Czodrowski", "Z Czyczula", "S D\u2019auria", "M D\u2019onofrio", "A D\u2019orazio", "M Da Cunha Sargedas\u00a0de\u00a0sousa", "C Da Via", "W Dabrowski", "A Dafinca", "T Dai", "C 
Dallapiccola", "M Dam", "M Dameri", "D Damiani", "H Danielsson", "V Dao", "G Darbo", "G Darlea", "J Dassoulas", "W Davey", "T Davidek", "N Davidson", "R Davidson", "E Davies", "M Davies", "O Davignon", "A Davison", "Y Davygora", "E Dawe", "I Dawson", "R Daya-Ishmukhametova", "K De", "R De Asmundis", "S De Castro", "S De Cecco", "J De Graat", "N De Groot", "P De Jong", "C De\u00a0la Taille", "H De\u00a0la Torre", "F De Lorenzi", "L De Mora", "L De Nooij", "D De Pedis", "A De Salvo", "U De Sanctis", "A De Santo", "J De Vivie\u00a0de\u00a0regie", "G De Zorzi", "W Dearnaley", "R Debbe", "C Debenedetti", "B Dechenaux", "D Dedovich", "J Degenhardt", "C Del Papa", "J Del Peso", "T Del Prete", "T Delemontex", "M Deliyergiyev", "A Dell\u2019acqua", "L Dell\u2019asta", "M Della Pietra", "D Della Volpe", "M Delmastro", "P Delsart", "C Deluca", "S Demers", "M Demichev", "B Demirkoz", "J Deng", "S Denisov", "D Derendarz", "J Derkaoui", "F Derue", "P Dervan", "K Desch", "E Devetak", "P Deviveiros", "A Dewhurst", "B Dewilde", "S Dhaliwal", "R Dhullipudi", "A Di Ciaccio", "L Di Ciaccio", "A Di Girolamo", "B Di Girolamo", "S Di Luise", "A Di Mattia", "B Di Micco", "R Di Nardo", "A Di Simone", "R Di Sipio", "M Diaz", "E Diehl", "J Dietrich", "T Dietzsch", "S Diglio", "K Dindar Yagci", "J Dingfelder", "F Dinut", "C Dionisi", "P Dita", "S Dita", "F Dittus", "F Djama", "T Djobava", "M Do Vale", "A Do Valle Wemans", "T Doan", "M Dobbs", "R Dobinson", "D Dobos", "E Dobson", "J Dodd", "C Doglioni", "T Doherty", "Y Doi", "J Dolejsi", "I Dolenc", "Z Dolezal", "B Dolgoshein", "T Dohmae", "M Donadelli", "J Donini", "J Dopke", "A Doria", "A Dos Anjos", "A Dotti", "M Dova", "A Doxiadis", "A Doyle", "N Dressnandt", "M Dris", "J Dubbert", "S Dube", "E Duchovni", "G Duckeck", "D Duda", "A Dudarev", "F Dudziak", "M D\u00fchrssen", "I Duerdoth", "L Duflot", "M-A Dufour", "L Duguid", "M Dunford", "H Duran Yildiz", "R Duxfield", "M Dwuznik", "F Dydak", "M D\u00fcren", "W Ebenstein", "J Ebke", "S 
Eckweiler", "K Edmonds", "W Edson", "C Edwards", "N Edwards", "W Ehrenfeld", "T Eifert", "G Eigen", "K Einsweiler", "E Eisenhandler", "T Ekelof", "M El Kacimi", "M Ellert", "S Elles", "F Ellinghaus", "K Ellis", "N Ellis", "J Elmsheuser", "M Elsing", "D Emeliyanov", "R Engelmann", "A Engl", "B Epp", "J Erdmann", "A Ereditato", "D Eriksson", "J Ernst", "M Ernst", "J Ernwein", "D Errede", "S Errede", "E Ertel", "M Escalier", "H Esch", "C Escobar", "X Espinal Curull", "B Esposito", "F Etienne", "A Etienvre", "E Etzion", "D Evangelakou", "H Evans", "L Fabbri", "C Fabre", "R Fakhrutdinov", "S Falciano", "Y Fang", "M Fanti", "A Farbin", "A Farilla", "J Farley", "T Farooque", "S Farrell", "S Farrington", "P Farthouat", "F Fassi", "P Fassnacht", "D Fassouliotis", "B Fatholahzadeh", "A Favareto", "L Fayard", "S Fazio", "R Febbraro", "P Federic", "O Fedin", "W Fedorko", "M Fehling-Kaschek", "L Feligioni", "D Fellmann", "C Feng", "E Feng", "A Fenyuk", "J Ferencei", "W Fernando", "S Ferrag", "J Ferrando", "V Ferrara", "A Ferrari", "P Ferrari", "R Ferrari", "D Ferreira\u00a0de\u00a0lima", "A Ferrer", "D Ferrere", "C Ferretti", "A Ferretto Parodi", "M Fiascaris", "F Fiedler", "A Filip\u010di\u010d", "F Filthaut", "M Fincke-Keeler", "M Fiolhais", "L Fiorini", "A Firan", "G Fischer", "M Fisher", "M Flechl", "I Fleck", "J Fleckner", "P Fleischmann", "S Fleischmann", "T Flick", "A Floderus", "L Flores Castillo", "M Flowerdew", "T Fonseca Martin", "A Formica", "A Forti", "D Fortin", "D Fournier", "A Fowler", "H Fox", "P Francavilla", "M Franchini", "S Franchino", "D Francis", "T Frank", "S Franz", "M Fraternali", "S Fratina", "S French", "C Friedrich", "F Friedrich", "R Froeschl", "D Froidevaux", "J Frost", "C Fukunaga", "E Fullana Torregrosa", "B Fulsom", "J Fuster", "C Gabaldon", "O Gabizon", "T Gadfort", "S Gadomski", "G Gagliardi", "P Gagnon", "C Galea", "B Galhardo", "E Gallas", "V Gallo", "B Gallop", "P Gallus", "K Gan", "Y Gao", "A Gaponenko", "F Garberson", "M 
Garcia-Sciveres", "C Garc\u00eda", "J Garc\u00eda Navarro", "R Gardner", "N Garelli", "H Garitaonandia", "V Garonne", "C Gatti", "G Gaudio", "B Gaur", "L Gauthier", "P Gauzzi", "I Gavrilenko", "C Gay", "G Gaycken", "E Gazis", "P Ge", "Z Gecse", "C Gee", "D Geerts", "C Geich-Gimbel", "K Gellerstedt", "C Gemme", "A Gemmell", "M Genest", "S Gentile", "M George", "S George", "P Gerlach", "A Gershon", "C Geweniger", "H Ghazlane", "N Ghodbane", "B Giacobbe", "S Giagu", "V Giakoumopoulou", "V Giangiobbe", "F Gianotti", "B Gibbard", "A Gibson", "S Gibson", "M Gilchriese", "D Gillberg", "A Gillman", "D Gingrich", "J Ginzburg", "N Giokaris", "M Giordani", "R Giordano", "F Giorgi", "P Giovannini", "P Giraud", "D Giugni", "M Giunta", "P Giusti", "B Gjelsten", "L Gladilin", "C Glasman", "J Glatzer", "A Glazov", "K Glitza", "G Glonti", "J Goddard", "J Godfrey", "J Godlewski", "M Goebel", "T G\u00f6pfert", "C Goeringer", "C G\u00f6ssling", "S Goldfarb", "T Golling", "A Gomes", "L Gomez Fajardo", "R Gon\u00e7alo", "J Goncalves Pinto Firmino\u00a0da\u00a0costa", "L Gonella", "S Gonz\u00e1lez\u00a0de\u00a0la\u00a0hoz", "G Gonzalez Parra", "M Gonzalez Silva", "S Gonzalez-Sevilla", "J Goodson", "L Goossens", "P Gorbounov", "H Gordon", "I Gorelov", "G Gorfine", "B Gorini", "E Gorini", "A Gori\u0161ek", "E Gornicki", "B Gosdzik", "A Goshaw", "M Gosselink", "M Gostkin", "I Gough Eschrich", "M Gouighri", "D Goujdami", "M Goulette", "A Goussiou", "C Goy", "S Gozpinar", "I Grabowska-Bold", "P Grafstr\u00f6m", "K-J Grahn", "F Grancagnolo", "S Grancagnolo", "V Grassi", "V Gratchev", "N Grau", "H Gray", "J Gray", "E Graziani", "O Grebenyuk", "T Greenshaw", "Z Greenwood", "K Gregersen", "I Gregor", "P Grenier", "J Griffiths", "N Grigalashvili", "A Grillo", "S Grinstein", "P Gris", "Y Grishkevich", "J-F Grivaz", "E Gross", "J Grosse-Knetter", "J Groth-Jensen", "K Grybel", "D Guest", "C Guicheney", "S Guindon", "U Gul", "H Guler", "J Gunther", "B Guo", "J Guo", "P Gutierrez", "N Guttman", "O 
Gutzwiller", "C Guyot", "C Gwenlan", "C Gwilliam", "A Haas", "S Haas", "C Haber", "H Hadavand", "D Hadley", "P Haefner", "F Hahn", "S Haider", "Z Hajduk", "H Hakobyan", "D Hall", "J Haller", "K Hamacher", "P Hamal", "K Hamano", "M Hamer", "A Hamilton", "S Hamilton", "L Han", "K Hanagaki", "K Hanawa", "M Hance", "C Handel", "P Hanke", "J Hansen", "J Hansen", "J Hansen", "P Hansen", "P Hansson", "K Hara", "G Hare", "T Harenberg", "S Harkusha", "D Harper", "R Harrington", "O Harris", "J Hartert", "F Hartjes", "T Haruyama", "A Harvey", "S Hasegawa", "Y Hasegawa", "S Hassani", "S Haug", "M Hauschild", "R Hauser", "M Havranek", "C Hawkes", "R Hawkings", "A Hawkins", "T Hayakawa", "T Hayashi", "D Hayden", "C Hays", "H Hayward", "S Haywood", "S Head", "V Hedberg", "L Heelan", "S Heim", "B Heinemann", "S Heisterkamp", "L Helary", "C Heller", "M Heller", "S Hellman", "D Hellmich", "C Helsens", "R Henderson", "M Henke", "A Henrichs", "A Henriques Correia", "S Henrot-Versille", "C Hensel", "T Hen\u00df", "C Hernandez", "Y Hern\u00e1ndez Jim\u00e9nez", "R Herrberg", "G Herten", "R Hertenberger", "L Hervas", "G Hesketh", "N Hessey", "E Hig\u00f3n-Rodriguez", "J Hill", "K Hiller", "S Hillert", "S Hillier", "I Hinchliffe", "E Hines", "M Hirose", "F Hirsch", "D Hirschbuehl", "J Hobbs", "N Hod", "M Hodgkinson", "P Hodgson", "A Hoecker", "M Hoeferkamp", "J Hoffman", "D Hoffmann", "M Hohlfeld", "M Holder", "S Holmgren", "T Holy", "J Holzbauer", "T Hong", "L Hooft\u00a0van\u00a0huysduynen", "S Horner", "J-Y Hostachy", "S Hou", "A Hoummada", "J Howard", "J Howarth", "I Hristova", "J Hrivnac", "T Hryn\u2019ova", "P Hsu", "S-C Hsu", "D Hu", "Z Hubacek", "F Hubaut", "F Huegging", "A Huettmann", "T Huffman", "E Hughes", "G Hughes", "M Huhtinen", "M Hurwitz", "U Husemann", "N Huseynov", "J Huston", "J Huth", "G Iacobucci", "G Iakovidis", "M Ibbotson", "I Ibragimov", "L Iconomidou-Fayard", "J Idarraga", "P Iengo", "O Igonkina", "Y Ikegami", "M Ikeno", "D Iliadis", "N Ilic", "T Ince", "J 
Inigo-Golfin", "P Ioannou", "M Iodice", "K Iordanidou", "V Ippolito", "A Irles Quiles", "C Isaksson", "M Ishino", "M Ishitsuka", "R Ishmukhametov", "C Issever", "S Istin", "A Ivashin", "W Iwanski", "H Iwasaki", "J Izen", "V Izzo", "B Jackson", "J Jackson", "P Jackson", "M Jaekel", "V Jain", "K Jakobs", "S Jakobsen", "T Jakoubek", "J Jakubek", "D Jana", "E Jansen", "H Jansen", "A Jantsch", "M Janus", "G Jarlskog", "L Jeanty", "I Jen-La Plante", "D Jennens", "P Jenni", "A Loevschall-Jensen", "P Je\u017e", "S J\u00e9z\u00e9quel", "M Jha", "H Ji", "W Ji", "J Jia", "Y Jiang", "M Jimenez Belenguer", "S Jin", "O Jinnouchi", "M Joergensen", "D Joffe", "M Johansen", "K Johansson", "P Johansson", "S Johnert", "K Johns", "K Jon-And", "G Jones", "R Jones", "T Jones", "C Joram", "P Jorge", "K Joshi", "J Jovicevic", "T Jovin", "X Ju", "C Jung", "R Jungst", "V Juranek", "P Jussel", "A Juste Rozas", "S Kabana", "M Kaci", "A Kaczmarska", "P Kadlecik", "M Kado", "H Kagan", "M Kagan", "E Kajomovitz", "S Kalinin", "L Kalinovskaya", "S Kama", "N Kanaya", "M Kaneda", "S Kaneti", "T Kanno", "V Kantserov", "J Kanzaki", "B Kaplan", "A Kapliy", "J Kaplon", "D Kar", "M Karagounis", "K Karakostas", "M Karnevskiy", "V Kartvelishvili", "A Karyukhin", "L Kashif", "G Kasieczka", "R Kass", "A Kastanas", "M Kataoka", "Y Kataoka", "E Katsoufis", "J Katzy", "V Kaushik", "K Kawagoe", "T Kawamoto", "G Kawamura", "M Kayl", "S Kazama", "V Kazanin", "M Kazarinov", "R Keeler", "P Keener", "R Kehoe", "M Keil", "G Kekelidze", "J Keller", "M Kenyon", "O Kepka", "N Kerschen", "B Ker\u0161evan", "S Kersten", "K Kessoku", "J Keung", "F Khalil-Zada", "H Khandanyan", "A Khanov", "D Kharchenko", "A Khodinov", "A Khomich", "T Khoo", "G Khoriauli", "A Khoroshilov", "V Khovanskiy", "E Khramov", "J Khubua", "H Kim", "S Kim", "N Kimura", "O Kind", "B King", "M King", "R King", "J Kirk", "A Kiryunin", "T Kishimoto", "D Kisielewska", "T Kitamura", "T Kittelmann", "K Kiuchi", "E Kladiva", "M Klein", "U Klein", "K 
Kleinknecht", "M Klemetti", "A Klier", "P Klimek", "A Klimentov", "R Klingenberg", "J Klinger", "E Klinkby", "T Klioutchnikova", "P Klok", "S Klous", "E-E Kluge", "T Kluge", "P Kluit", "S Kluth", "N Knecht", "E Kneringer", "E Knoops", "A Knue", "B Ko", "T Kobayashi", "M Kobel", "M Kocian", "P Kodys", "K K\u00f6neke", "A K\u00f6nig", "S Koenig", "L K\u00f6pke", "F Koetsveld", "P Koevesarki", "T Koffas", "E Koffeman", "L Kogan", "S Kohlmann", "F Kohn", "Z Kohout", "T Kohriki", "T Koi", "G Kolachev", "H Kolanoski", "V Kolesnikov", "I Koletsou", "J Koll", "A Komar", "Y Komori", "T Kondo", "T Kono", "A Kononov", "R Konoplich", "N Konstantinidis", "S Koperny", "K Korcyl", "K Kordas", "A Korn", "A Korol", "I Korolkov", "E Korolkova", "V Korotkov", "O Kortner", "S Kortner", "V Kostyukhin", "S Kotov", "V Kotov", "A Kotwal", "C Kourkoumelis", "V Kouskoura", "A Koutsman", "R Kowalewski", "T Kowalski", "W Kozanecki", "A Kozhin", "V Kral", "V Kramarenko", "G Kramberger", "M Krasny", "A Krasznahorkay", "J Kraus", "S Kreiss", "F Krejci", "J Kretzschmar", "N Krieger", "P Krieger", "K Kroeninger", "H Kroha", "J Kroll", "J Kroseberg", "J Krstic", "U Kruchonak", "H Kr\u00fcger", "T Kruker", "N Krumnack", "Z Krumshteyn", "T Kubota", "S Kuday", "S Kuehn", "A Kugel", "T Kuhl", "D Kuhn", "V Kukhtin", "Y Kulchitsky", "S Kuleshov", "C Kummer", "M Kuna", "J Kunkle", "A Kupco", "H Kurashige", "M Kurata", "Y Kurochkin", "V Kus", "E Kuwertz", "M Kuze", "J Kvita", "R Kwee", "A La Rosa", "L La Rotonda", "L Labarga", "J Labbe", "S Lablak", "C Lacasta", "F Lacava", "H Lacker", "D Lacour", "V Lacuesta", "E Ladygin", "R Lafaye", "B Laforge", "T Lagouri", "S Lai", "E Laisne", "M Lamanna", "L Lambourne", "C Lampen", "W Lampl", "E Lancon", "U Landgraf", "M Landon", "J Lane", "V Lang", "C Lange", "A Lankford", "F Lanni", "K Lantzsch", "S Laplace", "C Lapoire", "J Laporte", "T Lari", "A Larner", "M Lassnig", "P Laurelli", "V Lavorini", "W Lavrijsen", "P Laycock", "O Le Dortz", "E Le Guirriec", "E Le 
Menedeu", "T Lecompte", "F Ledroit-Guillon", "H Lee", "J Lee", "S Lee", "L Lee", "M Lefebvre", "M Legendre", "F Legger", "C Leggett", "M Lehmacher", "G Lehmann Miotto", "X Lei", "M Leite", "R Leitner", "D Lellouch", "B Lemmer", "V Lendermann", "K Leney", "T Lenz", "G Lenzen", "B Lenzi", "K Leonhardt", "S Leontsinis", "F Lepold", "C Leroy", "J-R Lessard", "C Lester", "C Lester", "J Lev\u00eaque", "D Levin", "L Levinson", "A Lewis", "G Lewis", "A Leyko", "M Leyton", "B Li", "H Li", "S Li", "X Li", "Z Liang", "H Liao", "B Liberti", "P Lichard", "M Lichtnecker", "K Lie", "W Liebig", "C Limbach", "A Limosani", "M Limper", "S Lin", "F Linde", "J Linnemann", "E Lipeles", "A Lipniacka", "T Liss", "D Lissauer", "A Lister", "A Litke", "C Liu", "D Liu", "H Liu", "J Liu", "L Liu", "M Liu", "Y Liu", "M Livan", "S Livermore", "A Lleres", "J Llorente Merino", "S Lloyd", "E Lobodzinska", "P Loch", "W Lockman", "T Loddenkoetter", "F Loebinger", "A Loginov", "C Loh", "T Lohse", "K Lohwasser", "M Lokajicek", "V Lombardo", "R Long", "L Lopes", "D Lopez Mateos", "J Lorenz", "N Lorenzo Martinez", "M Losada", "P Loscutoff", "F Lo Sterzo", "M Losty", "X Lou", "A Lounis", "K Loureiro", "J Love", "P Love", "A Lowe", "F Lu", "H Lubatti", "C Luci", "A Lucotte", "A Ludwig", "D Ludwig", "I Ludwig", "J Ludwig", "F Luehring", "G Luijckx", "W Lukas", "L Luminari", "E Lund", "B Lund-Jensen", "B Lundberg", "J Lundberg", "O Lundberg", "J Lundquist", "M Lungwitz", "D Lynn", "E Lytken", "H Ma", "L Ma", "G Maccarrone", "A Macchiolo", "B Ma\u010dek", "J Machado Miguens", "R Mackeprang", "R Madaras", "H Maddocks", "W Mader", "R Maenner", "T Maeno", "P M\u00e4ttig", "S M\u00e4ttig", "L Magnoni", "E Magradze", "K Mahboubi", "J Mahlstedt", "S Mahmoud", "G Mahout", "C Maiani", "C Maidantchik", "A Maio", "S Majewski", "Y Makida", "N Makovec", "P Mal", "B Malaescu", "P Malecki", "P Malecki", "V Maleev", "F Malek", "U Mallik", "D Malon", "C Malone", "S Maltezos", "V Malyshev", "S Malyukov", "R Mameghani", "J 
Mamuzic", "A Manabe", "L Mandelli", "I Mandi\u0107", "R Mandrysch", "J Maneira", "A Manfredini", "P Mangeard", "L Manhaes\u00a0de\u00a0andrade Filho", "J Manjarres Ramos", "A Mann", "P Manning", "A Manousakis-Katsikakis", "B Mansoulie", "A Mapelli", "L Mapelli", "L March", "J Marchand", "F Marchese", "G Marchiori", "M Marcisovsky", "C Marino", "F Marroquim", "Z Marshall", "F Martens", "L Marti", "S Marti-Garcia", "B Martin", "B Martin", "J Martin", "T Martin", "V Martin", "B Martin\u00a0dit\u00a0latour", "S Martin-Haugh", "M Martinez", "V Martinez Outschoorn", "A Martyniuk", "M Marx", "F Marzano", "A Marzin", "L Masetti", "T Mashimo", "R Mashinistov", "J Masik", "A Maslennikov", "I Massa", "G Massaro", "N Massol", "P Mastrandrea", "A Mastroberardino", "T Masubuchi", "P Matricon", "H Matsunaga", "T Matsushita", "C Mattravers", "J Maurer", "S Maxfield", "A Mayne", "R Mazini", "M Mazur", "L Mazzaferro", "M Mazzanti", "J Mc Donald", "S Mc Kee", "A Mccarn", "R Mccarthy", "T Mccarthy", "N Mccubbin", "K Mcfarlane", "J Mcfayden", "G Mchedlidze", "T Mclaughlan", "S Mcmahon", "R Mcpherson", "A Meade", "J Mechnich", "M Mechtel", "M Medinnis", "R Meera-Lebbai", "T Meguro", "R Mehdiyev", "S Mehlhase", "A Mehta", "K Meier", "B Meirose", "C Melachrinos", "B Mellado Garcia", "F Meloni", "L Mendoza Navas", "Z Meng", "A Mengarelli", "S Menke", "E Meoni", "K Mercurio", "P Mermod", "L Merola", "C Meroni", "F Merritt", "H Merritt", "A Messina", "J Metcalfe", "A Mete", "C Meyer", "C Meyer", "J-P Meyer", "J Meyer", "J Meyer", "T Meyer", "J Miao", "S Michal", "L Micu", "R Middleton", "S Migas", "L Mijovi\u0107", "G Mikenberg", "M Mikestikova", "M Miku\u017e", "D Miller", "R Miller", "W Mills", "C Mills", "A Milov", "D Milstead", "D Milstein", "A Minaenko", "M Mi\u00f1ano Moya", "I Minashvili", "A Mincer", "B Mindur", "M Mineev", "Y Ming", "L Mir", "G Mirabelli", "J Mitrevski", "V Mitsou", "S Mitsui", "P Miyagawa", "J Mj\u00f6rnmark", "T Moa", "V Moeller", "K M\u00f6nig", "N M\u00f6ser", 
"S Mohapatra", "W Mohr", "R Moles-Valls", "A Molfetas", "J Monk", "E Monnier", "J Montejo Berlingen", "F Monticelli", "S Monzani", "R Moore", "G Moorhead", "C Mora Herrera", "A Moraes", "N Morange", "J Morel", "G Morello", "D Moreno", "M Moreno Ll\u00e1cer", "P Morettini", "M Morgenstern", "M Morii", "A Morley", "G Mornacchi", "J Morris", "L Morvaj", "H Moser", "M Mosidze", "J Moss", "R Mount", "E Mountricha", "S Mouraviev", "E Moyse", "F Mueller", "J Mueller", "K Mueller", "T M\u00fcller", "T Mueller", "D Muenstermann", "Y Munwes", "W Murray", "I Mussche", "E Musto", "A Myagkov", "M Myska", "J Nadal", "K Nagai", "R Nagai", "K Nagano", "A Nagarkar", "Y Nagasaka", "M Nagel", "A Nairz", "Y Nakahama", "K Nakamura", "T Nakamura", "I Nakano", "G Nanava", "A Napier", "R Narayan", "M Nash", "T Nattermann", "T Naumann", "G Navarro", "H Neal", "P Nechaeva", "T Neep", "A Negri", "G Negri", "M Negrini", "S Nektarijevic", "A Nelson", "T Nelson", "S Nemecek", "P Nemethy", "A Nepomuceno", "M Nessi", "M Neubauer", "M Neumann", "A Neusiedl", "R Neves", "P Nevski", "F Newcomer", "P Newman", "V Nguyen Thi Hong", "R Nickerson", "R Nicolaidou", "B Nicquevert", "F Niedercorn", "J Nielsen", "N Nikiforou", "A Nikiforov", "V Nikolaenko", "I Nikolic-Audit", "K Nikolics", "K Nikolopoulos", "H Nilsen", "P Nilsson", "Y Ninomiya", "A Nisati", "R Nisius", "T Nobe", "L Nodulman", "M Nomachi", "I Nomidis", "S Norberg", "M Nordberg", "P Norton", "J Novakova", "M Nozaki", "L Nozka", "I Nugent", "A-E Nuncio-Quiroz", "G Nunes Hanninger", "T Nunnemann", "E Nurse", "B O\u2019brien", "D O\u2019neil", "V O\u2019shea", "L Oakes", "F Oakham", "H Oberlack", "J Ocariz", "A Ochi", "S Oda", "S Odaka", "J Odier", "H Ogren", "A Oh", "S Oh", "C Ohm", "T Ohshima", "H Okawa", "Y Okumura", "T Okuyama", "A Olariu", "A Olchevski", "S Olivares Pino", "M Oliveira", "D Oliveira Damazio", "E Oliver Garcia", "D Olivito", "A Olszewski", "J Olszowska", "A Onofre", "P Onyisi", "C Oram", "M Oreglia", "Y Oren", "D Orestano", "N 
Orlando", "I Orlov", "C Oropeza Barrera", "R Orr", "B Osculati", "R Ospanov", "C Osuna", "G Otero\u00a0y\u00a0garzon", "J Ottersbach", "M Ouchrif", "E Ouellette", "F Ould-Saada", "A Ouraou", "Q Ouyang", "A Ovcharova", "M Owen", "S Owen", "V Ozcan", "N Ozturk", "A Pacheco Pages", "C Padilla Aranda", "S Pagan Griso", "E Paganis", "C Pahl", "F Paige", "P Pais", "K Pajchel", "G Palacino", "C Paleari", "S Palestini", "D Pallin", "A Palma", "J Palmer", "Y Pan", "E Panagiotopoulou", "P Pani", "N Panikashvili", "S Panitkin", "D Pantea", "A Papadelis", "Th. Papadopoulou", "A Paramonov", "D Paredes Hernandez", "W Park", "M Parker", "F Parodi", "J Parsons", "U Parzefall", "S Pashapour", "E Pasqualucci", "S Passaggio", "A Passeri", "F Pastore", "F Pastore", "G P\u00e1sztor", "S Pataraia", "N Patel", "J Pater", "S Patricelli", "T Pauly", "M Pecsy", "S Pedraza Lopez", "M Pedraza Morales", "S Peleganchuk", "D Pelikan", "H Peng", "B Penning", "A Penson", "J Penwell", "M Perantoni", "K Perez", "T Perez Cavalcanti", "E Perez Codina", "M P\u00e9rez Garc\u00eda-Esta\u00f1", "V Perez Reale", "L Perini", "H Pernegger", "R Perrino", "P Perrodo", "V Peshekhonov", "K Peters", "B Petersen", "J Petersen", "T Petersen", "E Petit", "A Petridis", "C Petridou", "E Petrolo", "F Petrucci", "D Petschull", "M Petteni", "R Pezoa", "A Phan", "P Phillips", "G Piacquadio", "A Picazio", "E Piccaro", "M Piccinini", "S Piec", "R Piegaia", "D Pignotti", "J Pilcher", "A Pilkington", "J Pina", "M Pinamonti", "A Pinder", "J Pinfold", "B Pinto", "C Pizio", "M Plamondon", "M-A Pleier", "E Plotnikova", "A Poblaguev", "S Poddar", "F Podlyski", "L Poggioli", "D Pohl", "M Pohl", "G Polesello", "A Policicchio", "A Polini", "J Poll", "V Polychronakos", "D Pomeroy", "K Pomm\u00e8s", "L Pontecorvo", "B Pope", "G Popeneciu", "D Popovic", "A Poppleton", "X Portell Bueso", "G Pospelov", "S Pospisil", "I Potrap", "C Potter", "C Potter", "G Poulard", "J Poveda", "V Pozdnyakov", "R Prabhu", "P Pralavorio", "A Pranko", "S 
Prasad", "R Pravahan", "S Prell", "K Pretzl", "D Price", "J Price", "L Price", "D Prieur", "M Primavera", "K Prokofiev", "F Prokoshin", "S Protopopescu", "J Proudfoot", "X Prudent", "M Przybycien", "H Przysiezniak", "S Psoroulas", "E Ptacek", "E Pueschel", "J Purdham", "M Purohit", "P Puzo", "Y Pylypchenko", "J Qian", "A Quadt", "D Quarrie", "W Quayle", "F Quinonez", "M Raas", "V Radeka", "V Radescu", "P Radloff", "T Rador", "F Ragusa", "G Rahal", "A Rahimi", "D Rahm", "S Rajagopalan", "M Rammensee", "M Rammes", "A Randle-Conde", "K Randrianarivony", "F Rauscher", "T Rave", "M Raymond", "A Read", "D Rebuzzi", "A Redelbach", "G Redlinger", "R Reece", "K Reeves", "E Reinherz-Aronis", "A Reinsch", "I Reisinger", "C Rembser", "Z Ren", "A Renaud", "M Rescigno", "S Resconi", "B Resende", "P Reznicek", "R Rezvani", "R Richter", "E Richter-Was", "M Ridel", "M Rijpstra", "M Rijssenbeek", "A Rimoldi", "L Rinaldi", "R Rios", "I Riu", "G Rivoltella", "F Rizatdinova", "E Rizvi", "S Robertson", "A Robichaud-Veronneau", "D Robinson", "J Robinson", "A Robson", "J Rocha\u00a0de\u00a0lima", "C Roda", "D Roda\u00a0dos\u00a0santos", "A Roe", "S Roe", "O R\u00f8hne", "S Rolli", "A Romaniouk", "M Romano", "G Romeo", "E Romero Adam", "N Rompotis", "L Roos", "E Ros", "S Rosati", "K Rosbach", "A Rose", "M Rose", "G Rosenbaum", "E Rosenberg", "P Rosendahl", "O Rosenthal", "L Rosselet", "V Rossetti", "E Rossi", "L Rossi", "M Rotaru", "I Roth", "J Rothberg", "D Rousseau", "C Royon", "A Rozanov", "Y Rozen", "X Ruan", "F Rubbo", "I Rubinskiy", "N Ruckstuhl", "V Rud", "C Rudolph", "G Rudolph", "F R\u00fchr", "A Ruiz-Martinez", "L Rumyantsev", "Z Rurikova", "N Rusakovich", "J Rutherfoord", "C Ruwiedel", "P Ruzicka", "Y Ryabov", "M Rybar", "G Rybkin", "N Ryder", "A Saavedra", "I Sadeh", "Null- Sadrozinski", "R Sadykov", "F Safai Tehrani", "H Sakamoto", "G Salamanna", "A Salamon", "M Saleem", "D Salek", "D Salihagic", "A Salnikov", "J Salt", "B Salvachua Ferrando", "D Salvatore", "F Salvatore", "A 
Salvucci", "A Salzburger", "D Sampsonidis", "B Samset", "A Sanchez", "V Sanchez Martinez", "H Sandaker", "H Sander", "M Sanders", "M Sandhoff", "T Sandoval", "C Sandoval", "R Sandstroem", "D Sankey", "A Sansoni", "C Santamarina Rios", "C Santoni", "R Santonico", "H Santos", "J Saraiva", "T Sarangi", "E Sarkisyan-Grinbaum", "F Sarri", "G Sartisohn", "O Sasaki", "Y Sasaki", "N Sasao", "I Satsounkevitch", "G Sauvage", "E Sauvan", "J Sauvan", "P Savard", "V Savinov", "D Savu", "L Sawyer", "D Saxon", "J Saxon", "C Sbarra", "A Sbrizzi", "D Scannicchio", "M Scarcella", "J Schaarschmidt", "P Schacht", "D Schaefer", "U Sch\u00e4fer", "S Schaepe", "S Schaetzel", "A Schaffer", "D Schaile", "R Schamberger", "A Schamov", "V Scharf", "V Schegelsky", "D Scheirich", "M Schernau", "M Scherzer", "C Schiavi", "J Schieck", "M Schioppa", "S Schlenker", "E Schmidt", "K Schmieden", "C Schmitt", "S Schmitt", "M Schmitz", "B Schneider", "U Schnoor", "A Schoening", "A Schorlemmer", "M Schott", "D Schouten", "J Schovancova", "M Schram", "C Schroeder", "N Schroer", "M Schultens", "J Schultes", "H-C Schultz-Coulon", "H Schulz", "M Schumacher", "B Schumm", "P Schune", "C Schwanenberger", "A Schwartzman", "P Schwegler", "P Schwemling", "R Schwienhorst", "R Schwierz", "J Schwindling", "T Schwindt", "M Schwoerer", "G Sciolla", "W Scott", "J Searcy", "G Sedov", "E Sedykh", "S Seidel", "A Seiden", "F Seifert", "J Seixas", "G Sekhniaidze", "S Sekula", "K Selbach", "D Seliverstov", "B Sellden", "G Sellers", "M Seman", "N Semprini-Cesari", "C Serfon", "L Serin", "L Serkin", "R Seuster", "H Severini", "A Sfyrla", "E Shabalina", "M Shamim", "L Shan", "J Shank", "Q Shao", "M Shapiro", "P Shatalov", "K Shaw", "D Sherman", "P Sherwood", "S Shimizu", "M Shimojima", "T Shin", "M Shiyakova", "A Shmeleva", "M Shochet", "D Short", "S Shrestha", "E Shulga", "M Shupe", "P Sicho", "A Sidoti", "F Siegert", "D Sijacki", "O Silbert", "J Silva", "Y Silver", "D Silverstein", "S Silverstein", "V Simak", "O Simard", "L 
Simic", "S Simion", "E Simioni", "B Simmons", "R Simoniello", "M Simonyan", "P Sinervo", "N Sinev", "V Sipica", "G Siragusa", "A Sircar", "A Sisakyan", "S Sivoklokov", "J Sj\u00f6lin", "T Sjursen", "L Skinnari", "H Skottowe", "K Skovpen", "P Skubic", "M Slater", "T Slavicek", "K Sliwa", "V Smakhtin", "B Smart", "L Smestad", "S Smirnov", "Y Smirnov", "L Smirnova", "O Smirnova", "B Smith", "D Smith", "K Smith", "M Smizanska", "K Smolek", "A Snesarev", "S Snow", "J Snow", "S Snyder", "R Sobie", "J Sodomka", "A Soffer", "C Solans", "M Solar", "J Solc", "E Soldatov", "U Soldevila", "E Solfaroli Camillocci", "A Solodkov", "O Solovyanov", "V Solovyev", "N Soni", "V Sopko", "B Sopko", "M Sosebee", "R Soualah", "A Soukharev", "S Spagnolo", "F Span\u00f2", "R Spighi", "G Spigo", "R Spiwoks", "M Spousta", "T Spreitzer", "B Spurlock", "R St. Denis", "J Stahlman", "R Stamen", "E Stanecka", "R Stanek", "C Stanescu", "M Stanescu-Bellu", "M Stanitzki", "S Stapnes", "E Starchenko", "J Stark", "P Staroba", "P Starovoitov", "R Staszewski", "A Staude", "P Stavina", "G Steele", "P Steinbach", "P Steinberg", "I Stekl", "B Stelzer", "H Stelzer", "O Stelzer-Chilton", "H Stenzel", "S Stern", "G Stewart", "J Stillings", "M Stockton", "K Stoerig", "G Stoicea", "S Stonjek", "P Strachota", "A Stradling", "A Straessner", "J Strandberg", "S Strandberg", "A Strandlie", "M Strang", "E Strauss", "M Strauss", "P Strizenec", "R Str\u00f6hmer", "D Strom", "J Strong", "R Stroynowski", "J Strube", "B Stugu", "I Stumer", "J Stupak", "P Sturm", "N Styles", "D Soh", "D Su", "Hs. 
Subramania", "A Succurro", "Y Sugaya", "C Suhr", "M Suk", "V Sulin", "S Sultansoy", "T Sumida", "X Sun", "J Sundermann", "K Suruliz", "G Susinno", "M Sutton", "Y Suzuki", "Y Suzuki", "M Svatos", "S Swedish", "I Sykora", "T Sykora", "J S\u00e1nchez", "D Ta", "K Tackmann", "A Taffard", "R Tafirout", "N Taiblum", "Y Takahashi", "H Takai", "R Takashima", "H Takeda", "T Takeshita", "Y Takubo", "M Talby", "A Talyshev", "M Tamsett", "K Tan", "J Tanaka", "R Tanaka", "S Tanaka", "S Tanaka", "A Tanasijczuk", "K Tani", "N Tannoury", "S Tapprogge", "D Tardif", "S Tarem", "F Tarrade", "G Tartarelli", "P Tas", "M Tasevsky", "E Tassi", "M Tatarkhanov", "Y Tayalati", "C Taylor", "F Taylor", "G Taylor", "W Taylor", "M Teinturier", "F Teischinger", "M Teixeira Dias Castanheira", "P Teixeira-Dias", "K Temming", "H Ten Kate", "P Teng", "S Terada", "K Terashi", "J Terron", "M Testa", "R Teuscher", "J Therhaag", "T Theveneaux-Pelzer", "S Thoma", "J Thomas", "E Thompson", "P Thompson", "P Thompson", "A Thompson", "L Thomsen", "E Thomson", "M Thomson", "W Thong", "R Thun", "F Tian", "M Tibbetts", "T Tic", "V Tikhomirov", "Y Tikhonov", "S Timoshenko", "P Tipton", "S Tisserant", "T Todorov", "S Todorova-Nova", "B Toggerson", "J Tojo", "S Tok\u00e1r", "K Tokushuku", "K Tollefson", "M Tomoto", "L Tompkins", "K Toms", "A Tonoyan", "C Topfel", "N Topilin", "I Torchiani", "E Torrence", "H Torres", "E Torr\u00f3 Pastor", "J Toth", "F Touchard", "D Tovey", "T Trefzger", "L Tremblet", "A Tricoli", "I Trigger", "S Trincaz-Duvoid", "M Tripiana", "N Triplett", "W Trischuk", "B Trocm\u00e9", "C Troncon", "M Trottier-Mcdonald", "M Trzebinski", "A Trzupek", "C Tsarouchas", "Null- Tseng", "M Tsiakiris", "P Tsiareshka", "D Tsionou", "G Tsipolitis", "S Tsiskaridze", "V Tsiskaridze", "E Tskhadadze", "I Tsukerman", "V Tsulaia", "J-W Tsung", "S Tsuno", "D Tsybychev", "A Tua", "A Tudorache", "V Tudorache", "J Tuggle", "M Turala", "D Turecek", "I Turk Cakir", "E Turlay", "R Turra", "P Tuts", "A Tykhonov", "M 
Tylmad", "M Tyndel", "G Tzanakos", "K Uchida", "I Ueda", "R Ueno", "M Ugland", "M Uhlenbrock", "M Uhrmacher", "F Ukegawa", "G Unal", "A Undrus", "G Unel", "Y Unno", "D Urbaniec", "P Urquijo", "G Usai", "M Uslenghi", "L Vacavant", "V Vacek", "B Vachon", "S Vahsen", "J Valenta", "S Valentinetti", "A Valero", "S Valkar", "E Valladolid Gallego", "S Vallecorsa", "J Valls Ferrer", "R Van Berg", "P Van\u00a0der Deijl", "R Van\u00a0der Geer", "H Van\u00a0der Graaf", "R Van\u00a0der Leeuw", "E Van\u00a0der Poel", "D Van\u00a0der Ster", "N Van Eldik", "P Van Gemmeren", "I Van Vulpen", "M Vanadia", "W Vandelli", "A Vaniachine", "P Vankov", "F Vannucci", "R Vari", "T Varol", "D Varouchas", "A Vartapetian", "K Varvell", "V Vassilakopoulos", "F Vazeille", "T Vazquez Schroeder", "G Vegni", "J Veillet", "F Veloso", "R Veness", "S Veneziano", "A Ventura", "D Ventura", "M Venturi", "N Venturi", "V Vercesi", "M Verducci", "W Verkerke", "J Vermeulen", "A Vest", "M Vetterli", "I Vichou", "T Vickey", "O Vickey Boeriu", "G Viehhauser", "S Viel", "M Villa", "M Villaplana Perez", "E Vilucchi", "M Vincter", "E Vinek", "V Vinogradov", "M Virchaux", "J Virzi", "O Vitells", "M Viti", "I Vivarelli", "F Vives Vaque", "S Vlachos", "D Vladoiu", "M Vlasak", "A Vogel", "P Vokac", "G Volpi", "M Volpi", "G Volpini", "H Von\u00a0der Schmitt", "H Von Radziewski", "E Von Toerne", "V Vorobel", "V Vorwerk", "M Vos", "R Voss", "T Voss", "J Vossebeld", "N Vranjes", "M Vranjes Milosavljevic", "V Vrba", "M Vreeswijk", "T Vu Anh", "R Vuillermet", "I Vukotic", "W Wagner", "P Wagner", "H Wahlen", "S Wahrmund", "J Wakabayashi", "S Walch", "J Walder", "R Walker", "W Walkowiak", "R Wall", "P Waller", "B Walsh", "C Wang", "H Wang", "H Wang", "J Wang", "J Wang", "R Wang", "S Wang", "T Wang", "A Warburton", "C Ward", "M Warsinsky", "A Washbrook", "C Wasicki", "I Watanabe", "P Watkins", "A Watson", "I Watson", "M Watson", "G Watts", "S Watts", "A Waugh", "B Waugh", "M Weber", "P Weber", "A Weidberg", "P Weigell", "J 
Weingarten", "C Weiser", "P Wells", "T Wenaus", "D Wendland", "Z Weng", "T Wengler", "S Wenig", "N Wermes", "M Werner", "P Werner", "M Werth", "M Wessels", "J Wetter", "C Weydert", "K Whalen", "S Wheeler-Ellis", "A White", "M White", "S White", "S Whitehead", "D Whiteson", "D Whittington", "F Wicek", "D Wicke", "F Wickens", "W Wiedenmann", "M Wielers", "P Wienemann", "C Wiglesworth", "L Wiik-Fuchs", "P Wijeratne", "A Wildauer", "M Wildt", "I Wilhelm", "H Wilkens", "J Will", "E Williams", "H Williams", "W Willis", "S Willocq", "J Wilson", "M Wilson", "A Wilson", "I Wingerter-Seez", "S Winkelmann", "F Winklmeier", "M Wittgen", "S Wollstadt", "M Wolter", "H Wolters", "W Wong", "G Wooden", "B Wosiek", "J Wotschack", "M Woudstra", "K Wozniak", "K Wraight", "M Wright", "B Wrona", "S Wu", "X Wu", "Y Wu", "E Wulf", "B Wynne", "S Xella", "M Xiao", "S Xie", "C Xu", "D Xu", "B Yabsley", "S Yacoob", "M Yamada", "H Yamaguchi", "A Yamamoto", "K Yamamoto", "S Yamamoto", "T Yamamura", "T Yamanaka", "J Yamaoka", "T Yamazaki", "Y Yamazaki", "Z Yan", "H Yang", "U Yang", "Y Yang", "Z Yang", "S Yanush", "L Yao", "Y Yao", "Y Yasu", "G Ybeles Smit", "J Ye", "S Ye", "M Yilmaz", "R Yoosoofmiya", "K Yorita", "R Yoshida", "C Young", "C Young", "S Youssef", "D Yu", "J Yu", "J Yu", "L Yuan", "A Yurkewicz", "M Byszewski", "B Zabinski", "R Zaidan", "A Zaitsev", "Z Zajacova", "L Zanello", "D Zanzi", "A Zaytsev", "C Zeitnitz", "M Zeman", "A Zemla", "C Zendler", "O Zenin", "T \u017deni\u0161", "Z Zinonos", "S Zenz", "D Zerwas", "G Zevi\u00a0della\u00a0porta", "Z Zhan", "D Zhang", "H Zhang", "J Zhang", "X Zhang", "Z Zhang", "L Zhao", "T Zhao", "Z Zhao", "A Zhemchugov", "J Zhong", "B Zhou", "N Zhou", "Y Zhou", "C Zhu", "H Zhu", "J Zhu", "Y Zhu", "X Zhuang", "V Zhuravlov", "D Zieminska", "N Zimin", "R Zimmermann", "S Zimmermann", "S Zimmermann", "M Ziolkowski", "R Zitoun", "L \u017divkovi\u0107", "V Zmouchko", "G Zobernig", "A Zoccoli", "M Zur Nedden", "V Zutshi", "L Zwalinski"], "doi": 
"10.1140/epjc/s10052-012-2241-5", "journal": "The European Physical Journal C"}
+{"title": "Mirrors with regular hexagonal segments", "sha": "19d5d3272074f3d49879cd7c97c28dcc633b3b76", "authors": ["Dario Amodei", "Stephen Padin"], "doi": null, "journal": null}
+{"title": "Leakage Biased pMOS Sleep Switch Dynamic Circuits", "sha": "1a2c05a969beb61f32e5824d9fd916c68dc50ea8", "authors": ["Z Liu", "V Kursun"], "doi": "10.1109/tcsii.2006.882206", "journal": "IEEE Transactions on Circuits and Systems II: Express Briefs"}
+{"title": "Item memory, source memory, and the medial temporal lobe: Concordant findings from fMRI and memory-impaired patients", "sha": "1a979e3af440652b8964c05dd6b0c81e5d19d76d", "authors": ["Jeffrey Gold", "Christine Smith", "Peter Bayley", "Yael Shrager", "James Brewer", "Craig Stark", "Ramona Hopkins", "Larry Squire", "\u2021 \u00a7 \u00a7 \u00b6 \u00b6"], "doi": null, "journal": null}
+{"title": "Prologue: nonclassical modalities of myocardial preconditioning", "sha": "1aaac38af6d1b816bafe69820b1cd9b7036de6d4", "authors": ["Garrett Gross", "David Warltier"], "doi": "10.1152/ajpheart.00147.2002", "journal": "American Journal of Physiology - Heart and Circulatory Physiology"}
+{"title": "The relationship between atypical semantic activation and odd speech in schizotypy across emotionally evocative conditions", "sha": "1ae06871cf35bd24c27950f818d69d66e982cb02", "authors": ["Kyle Minor", "Alex Cohen", "Christopher Weber", "Laura Brown"], "doi": "10.1016/j.schres.2010.06.016", "journal": "Schizophrenia Research"}
+{"title": "Thermophoresis at a charged surface: the role of hydrodynamic slip", "sha": "1ae635422b9092a62baf7daf78c0bf9c03ea5804", "authors": ["Julien Morthomas", "Alois W\u00fcrger"], "doi": null, "journal": "Journal of Physics: Condensed Matter"}
+{"title": "Assessment of the elasticity properties of the ascending aorta in patients with subclinical hypothyroidism by tissue Doppler imaging Avalia\u00e7\u00e3o das propriedades de elasticidade da aorta ascendente em pacientes com hipotiroidismo subcl\u00ednico por imagem de Doppler tecidual", "sha": "1b12efebe1b24f89953322a970c38f0ed380e8a8", "authors": ["Mustafa Yurtda\u015f", "T\u00fcrkay \u00d6zcan", "Ramazan Gen", "Kas\u0131m Ayd\u0131n"], "doi": null, "journal": "Arq Bras Endocrinol Metab"}
+{"title": "Distinct prognostic values and potential drug targets of ALDH1 isoenzymes in non-small-cell lung cancer", "sha": "1b32d3c4dae5e34bbab7ecf9eaba8e60d0c9ff24", "authors": ["Qinghua You", "Dongxiang Xu", "Huanchen Guo"], "doi": "10.2147/dddt.s87197", "journal": "Drug Design, Development and Therapy"}
+{"title": "Audiometric assessment for military personnel Avalia\u00e7\u00e3o do perfil auditivo de militares de um quartel do", "sha": "1b5dccaded1bcc516c03cdd412d8d8cc62bc05d1", "authors": ["Ana Silva", "Everardo Da Costa", "Salete Rodrigues", "Humberto Souza", "Val\u00e9ria Massafera", "Ex\u00e9rcito Brasileiro"], "doi": null, "journal": "REVISTA BRASILEIRA DE OTORRINOLARINGOLOGIA"}
+{"title": "Generalized monotonicity analysis", "sha": "1bb70d82b0076844d84751e48949a2c1f3ff705a", "authors": ["Bruno Strulovici", "Thomas Weber"], "doi": "10.1007/s00199-009-0450-4", "journal": "Economic Theory"}
+{"title": "Identifying the Independent Inertial Parameter Space of Robot Manipulators", "sha": "1c22f804f7ec2af6799260f79195cdfb690991e6", "authors": ["Shih-Ying Sheu", "Michael Walker", "Ann Arbor", "Michigan"], "doi": null, "journal": null}
+{"title": "Testing and modelling non-normality within the one-factor model", "sha": "1ceb3a7565ee6676d0ca7b459f851f650c7a243f", "authors": ["Dylan Molenaar", "Conor Dolan", "Norman Verhelst"], "doi": "10.1348/000711009x456935", "journal": "British Journal of Mathematical and Statistical Psychology"}
+{"title": "The Structure and Complexity of Extreme Nash Equilibria", "sha": "1cfbdafb7230183f919bca17320f948244f6b2d2", "authors": ["M Gairing", "T L\u00fccking", "M Mavronicolas", "B Monien", "P Spirakis"], "doi": null, "journal": null}
+{"title": "A Service of zbw A unified theory of firm selection and growth", "sha": "1cfe57825a9e5700a2c20913219cc873ed318979", "authors": [], "doi": null, "journal": null}
+{"title": "Spatial distribution pattern of Mytilus chilensis beds in the Reloncav\u00ed fjord: hypothesis on associated processes", "sha": "1d7d96cd4225193a43a38f9523b0948fe11e2f81", "authors": ["Carlos Flores", "Manuel Gomez", "Camilo Mu\u00f1oz", "Leny P\u00e9rez", "Sandra Arribas", "Marcela Opazo", "Edwin Huaquin"], "doi": "10.1186/s40693-015-0041-7", "journal": "Revista Chilena de Historia Natural"}
+{"title": "Algorithm 892: DISPMODULE, a Fortran 95 module for pretty-printing ma-trices", "sha": "1dd7a777a3b578f98ca35d30e3bdde65b989cefd", "authors": ["Kristjan Jonasson"], "doi": null, "journal": "ACM Trans. Math. Softw"}
+{"title": "A Language-Based Approach for Improving the Robustness of Network Application Protocol Implementations A Language-Based Approach for Improving the Robustness of Network Application Protocol Implementations. [Research", "sha": "1ee93e4ae97c19cd14a024a99e0bfcafaa937320", "authors": ["Laurent Burgy", "Laurent R\u00e9veil\u00ec", "Julia Lawall", "Gilles Muller", "Laurent Burgy", "Laurent R\u00e9veil\u00ec", "Julia Lawall", "Gilles"], "doi": null, "journal": null}
+{"title": "Transfer RNA detection by small RNA deep sequencing and disease association with myelodysplastic syndromes", "sha": "1ef9e53c100e5560f68a9b52ce248b4cb92f7f29", "authors": ["Yan Guo", "Amma Bosompem", "Sanjay Mohan", "Begum Erdogan", "Fei Ye", "Kasey Vickers", "Quanhu Sheng", "Shilin Zhao", "Null- Li", "Pei-Fang Su", "Madan Jagasia", "Stephen Strickland", "Elizabeth Griffiths", "Annette Kim"], "doi": "10.1186/s12864-015-1929-y", "journal": "BMC Genomics"}
+{"title": "Magnetism of substitutional Co impurities in graphene: Realization of single\u03c0vacancies", "sha": "1f6c1d96d44b7f4a7fff38045f1e6e9f7be92bc1", "authors": ["E Santos", "D S\u00e1nchez-Portal", "A Ayuela"], "doi": "10.1103/physrevb.81.125433", "journal": "Physical Review B"}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics The Comitology Game: European Policymaking with Parliamentary Involvement", "sha": "20c9deda71acca3f754dc5ae01780c3115c41802", "authors": ["Bernard Steunenberg", "Dieter Schmidtchen"], "doi": null, "journal": null}
+{"title": "The inverse scattering problem of some Schr\u00f6dinger type equation with turning point", "sha": "21174e246c8a174bf15328cd9437344f2605c8ac", "authors": ["Zaki El-Raheem", "Farouk Salama"], "doi": "10.1186/s13661-015-0316-6", "journal": "Boundary Value Problems"}
+{"title": "Effects of mode degeneracy in the LIGO Livingston Observatory recycling cavity", "sha": "214447a16cf3afab71f39f2b0576f3e121fe0573", "authors": ["Andri Gretarsson", "Erika D&apos;ambrosio", "Valery Frolov", "Brian O&apos;reilly", "Peter Fritschel"], "doi": null, "journal": null}
+{"title": "Spatial distribution of wind turbines is crucial for the survival of red kite populations", "sha": "21bca26de097b8f054aae69fec0c4001c4496407", "authors": ["Michael Schaub"], "doi": "10.1016/j.biocon.2012.06.021", "journal": "Biological Conservation"}
+{"title": "TILTING SATURN. I. ANALYTIC MODEL", "sha": "21c73610de2394fa0f203f595b62df2ff39a2373", "authors": ["William Ward", "Douglas Hamilton"], "doi": null, "journal": null}
+{"title": "Reactive oxygen species contribute to dysfunction of bone marrow hematopoietic stem cells in aged C57BL/6 J mice", "sha": "21cf2cb1d067a39f682c3fa4a97f04f7a9c83724", "authors": ["Marcella Porto", "Bianca Rodrigues", "Thiago Menezes", "Sara Ceschim", "Dulce Casarini", "Agata Gava", "Thiago Pereira", "Elisardo Vasquez", "Bianca Campagnaro", "Silvana Meyrelles"], "doi": "10.1186/s12929-015-0201-8", "journal": "Journal of Biomedical Science"}
+{"title": "Dynamic Hash Tables Daniel! Sleator Editor", "sha": "21ea85d0f30d9eb9c370b2e2ac38f81e766215b4", "authors": ["Pedke Larson"], "doi": null, "journal": null}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics Family Tax Splitting: A Microsimulation of its Potential Labour Supply and Intra-household Welfare Effects in Germany", "sha": "22365b76518ff793e7d34b5e58a36b1770551e32", "authors": ["Fran\u00e7ois Laisney", "Denis Beninger", "Miriam Beblo"], "doi": null, "journal": null}
+{"title": "SOME SIMPLE ANALYTICS", "sha": "230ea2621a71b7fb9833a39e9a0ddd3c9f135b99", "authors": ["Martin Feldstein"], "doi": null, "journal": null}
+{"title": "State Prevailing Wage Laws and School Construction Costs", "sha": "232260eeb59aaecbf045a45fcf6b50aba450ee1e", "authors": ["Hamid Azari-Rad", "Peter Philips", "Mark Prus"], "doi": null, "journal": null}
+{"title": "Fluid resuscitation in severe sepsis and septic shock: An evidence-based review", "sha": "23777308bde73123fae87daf223ae0b5f444843a", "authors": ["Jean-Louis Vincent", "Herwig Gerlach"], "doi": "10.1097/01.ccm.0000142984.44321.a4", "journal": "Critical Care Medicine"}
+{"title": "A Political Economy of the African School as a Learning Organization", "sha": "23eaa72bd4543d04262334852b6f287f519438cd", "authors": ["Aok Noah"], "doi": "10.5296/ijld.v3i3.3861", "journal": "International Journal of Learning and Development"}
+{"title": "Quantitative Study of Cortical Orientation Selectivity in Visually Inexperienced Kitten", "sha": "23eaf51e6de401ba9f8232de9eba966123d05826", "authors": ["Helen Sherk", "Michael Stryker"], "doi": null, "journal": null}
+{"title": "Pharmacodynamics and kinetics of omeprazole MUPS 20 mg and pantoprazole 40 mg during repeated oral administration in Helicobacter pylori-negative subjects", "sha": "24152ff3a336f3d5fc2a4439a9f456ecd728b1ff", "authors": ["W Geus", "R Matho \u00c3 T", "P Mulder\u00e0", "C Lamers"], "doi": null, "journal": null}
+{"title": "Algorithm 736: Hyperelliptic Integrals and the Surface Measure of Ellipsoids", "sha": "24417e305419177dc24ca4aff2075ede098e5a30", "authors": ["Charles Dunkl", "Donald Ramirez"], "doi": null, "journal": null}
+{"title": "Hybrid Modulation-Doping of Solution-Processed Ultrathin Layers of ZnO Using Molecular Dopants", "sha": "244884ce4854750d173da815f764fe9ebbfd8cfb", "authors": ["Stefan Schie\u00dfl", "Hendrik Faber", "Yen-Hung Lin", "Stephan Rossbauer", "Qingxiao Wang", "Kui Zhao", "Aram Amassian", "Jana Zaumseil", "Thomas Anthopoulos"], "doi": "10.1002/adma.201503200", "journal": "Advanced Materials"}
+{"title": "PORCELLANID CRABS (CRUSTACEA, DECAPODA) INHABITING SAND REEFS BUILT BY Phragmatopoma lapidosa (Polychaeta, Sabellariidae) AT PARANAPU\u00c3 BEACH, S\u00c3O VICENTE, SP, BRAZIL", "sha": "24899b7224b6ad9ca66fdce5471f30060eab5957", "authors": ["Micheletti-Flores", "C", "Negreiros-Fransozo", "M"], "doi": null, "journal": "Rev. Brasil. Biol"}
+{"title": "Geochemistry of mafic phenocrysts from alkaline lamprophyres of the Spanish Central System: implications on crystal fractionation, magma mixing and xenoliths entrapment within deep magma chambers", "sha": "25113383c54bcf6c14d1bb4cc4e4b79deb04e825", "authors": ["David Orejana", "Carlos Villaseca", "Bruce Pater", "So"], "doi": null, "journal": null}
+{"title": "The timing and magnitude of Stroop interference and facilitation in monolinguals and bilinguals", "sha": "25826b46813613a3a068c30a32a563c941c9adb2", "authors": ["Emily Coderre", "Walter Van Heuven", "Kathy Conklin"], "doi": "10.1017/s1366728912000405", "journal": "Bilingualism: Language and Cognition"}
+{"title": "Experimental study on a continuous adsorption water chiller with novel design", "sha": "25ff350f5185c1a73ce356858d4ad6fd88c29fdf", "authors": ["Y Liu", "R Wang", "Z Xia"], "doi": "10.1016/j.ijrefrig.2004.09.004", "journal": "International Journal of Refrigeration"}
+{"title": "Methotrexate: Final 10-year Data in Longstanding Rheumatoid Arthritis Clinical, Functional, and Radiographic Benefits of Longterm Adalimumab Plus", "sha": "262e2dd6d02ed65774db817a11543e1162f3ebe1", "authors": ["Shufang Liu", "Beno\u00eet Gu\u00e9rette", "Neelufar Mozaffarian", "Edward Keystone", "D\u00e9sir\u00e9e Van Der Heijde", "Arthur Kavanaugh", "Hartmut Kupper"], "doi": null, "journal": "The Journal of Rheumatology Journal of Rheumatology The on October"}
+{"title": "HARDY'S INEQUALITY IN A VARIABLE EXPONENT SOBOLEV SPACE", "sha": "26a949df1613a6d9d792b32f7ad42ec95e5f0b64", "authors": ["Petteri Harjulehto", "Peter Ast\u00a8o", "Ast\u00a8 Ast\u00a8o", "Mika Koskenoja"], "doi": null, "journal": "Georgian Mathematical Journal"}
+{"title": "Pore-scale micro-computed-tomography imaging: Nonwetting-phase cluster-size distribution during drainage and imbibition", "sha": "26f9363b139174594fea956730dc5b4ae2b90fc5", "authors": ["A Georgiadis", "S Berg", "A Makurat", "G Maitland", "H Ott"], "doi": "10.1103/physreve.88.033002", "journal": "Physical Review E"}
+{"title": "THE ASYMPTOTIC EXPANSIONS FOR THE QDD PERIODIC MATHIEU FUNCTIONS", "sha": "27cf0da5ba688134f64f1da1982995f0aef4ef94", "authors": ["Gertrude Blanch"], "doi": null, "journal": null}
+{"title": "On-body TOA-based ranging error model for motion capture applications within wearable UWB networks", "sha": "27ea90b29f1c4224326cab44e473515cbb957a03", "authors": ["Jihad Hamie", "Benoit Denis", "Raffaele D\u2019errico", "Cedric Richard"], "doi": "10.1007/s12652-013-0215-6", "journal": "Journal of Ambient Intelligence and Humanized Computing"}
+{"title": null, "sha": "282637f682d8bc98e0193b5b6382806495b4fe87", "authors": [], "doi": null, "journal": null}
+{"title": "L\u00e9vy flights in human behavior and cognition", "sha": "2837fd59bb8e792b397ae5643d0f3b51f8e66c3a", "authors": ["Andrea Baronchelli", "Filippo Radicchi"], "doi": "10.1016/j.chaos.2013.07.013", "journal": "Chaos, Solitons & Fractals"}
+{"title": "Exercise training restores the cardiac microRNA-1 and \u2212214 levels regulating Ca2+ handling after myocardial infarction", "sha": "283d6dffbaba150f637d391f43f91c9b29e47ed1", "authors": ["St\u00e9phano Melo", "Val\u00e9rio Barauna", "Vander Neves", "Tiago Fernandes", "Lucienne Lara", "Diego Mazzotti", "Edilamar Oliveira"], "doi": "10.1186/s12872-015-0156-4", "journal": "BMC Cardiovascular Disorders"}
+{"title": "Why Pleiotropic Interventions are Needed for Alzheimer's Disease", "sha": "288182c252807745fd6bcca68972172e80cf1662", "authors": ["Sally Frautschy", "Greg Cole"], "doi": "10.1007/s12035-010-8137-1", "journal": "Molecular Neurobiology"}
+{"title": "Evaporation from Nonvegetated Surfaces: Surface Aridity Methods and Passive Microwave Remote Sensing", "sha": "28e234631d3935d9567401a10aef5be743435699", "authors": ["Anthony Cahill", "Marc Parlange", "Thomas Jackson", "Peggy O&apos;neill", "T Schmugge"], "doi": null, "journal": null}
+{"title": "State and disturbance estimation for nonlinear systems affine in the unmeasured variables", "sha": "2972f2670c22fcd3f55410085b58efb5d2e4ebfa", "authors": ["Michael Kurtz", "Michael Henson"], "doi": null, "journal": "Computers Chem. Engng"}
+{"title": "IRG/WP 08-40421 THE INTERNATIONAL RESEARCH GROUP ON WOOD PROTECTION Section 4 Processes and Properties Variations of Furfuryl alcohol and Wolmanit CX-8 treatability of pine sapwood within and between trees Variations of Furfuryl alcohol and Wolmanit CX-8 treatability of pine sapwood within and between trees", "sha": "2984f253115ac947b1f7c0e602e6b34efcd00af8", "authors": ["Erik Larn\u00f8y", "Stig Lande", "Geir Vest\u00f8l", "Erik Larn\u00f8y", "Stig Lande", "Geir Vest\u00f8l"], "doi": null, "journal": null}
+{"title": "Neural correlates of stopping and self-reported impulsivity", "sha": "29d21d871eab9b16fdbe40ebe6aae59d90d45f6d", "authors": ["Marieke Lansbergen", "Koen B\u00f6cker", "Evelijne Bekker", "J Kenemans"], "doi": "10.1016/j.clinph.2007.06.011", "journal": "Clinical Neurophysiology"}
+{"title": null, "sha": "2a4d96709aa4648c8a6c919c37c71d7d7693e5c0", "authors": [], "doi": null, "journal": null}
+{"title": "ON THE STABILITY OF CHOLESKY FACTORIZATION FOR SYMMETRIC QUASIDEFINITE SYSTEMS*", "sha": "2a4e5e873553f52aa337a5c5228b409b7d061028", "authors": ["Philip Gillt", "Michael Saunders$", "Joseph Shinnerlt"], "doi": null, "journal": "SIAM J. MATRIX ANAL. APPL"}
+{"title": "Revolucion\u00e1rio e ainda assim desconhecido! (Weltbewegend und doch unbekannt!)", "sha": "2a732b5790204fab9e57808dd63968b89bca9948", "authors": ["Peter Schuster"], "doi": null, "journal": null}
+{"title": "Dolor como factor predictor de depresi\u00f3n en el paciente oncol\u00f3gico: estudio de casos y controles. Estudio D-PRESS", "sha": "2a7b91aec818987571d3758727c1ccd2ddf055f3", "authors": ["J Carulla", "C Jara", "J Sanz", "C Mart\u00ednez", "F Ledesma", "E Zubillaga", "Carulla", "Jara", "Sanz", "Mart\u00ednez Ledesma", "F", "Zubi-Llaga Dolor", "La Creu", "San Pau", "Barcelona", "E Jim\u00e9nez", "J Avanza", "A Vaz", "Madrid Cajal", "P D\u00edaz", "Germans Trias", "I Pujol", "Barcelona", "J Garc\u00eda"], "doi": null, "journal": null}
+{"title": "Robust Web Services Provisioning Through On-Demand Replication", "sha": "2a9173cfcd7338e1b965db40e38f0851fe6030d4", "authors": ["Quan Sheng", "Zakaria Maamar", "Jian Yu", "Anne Ngu"], "doi": null, "journal": null}
+{"title": "Polarization variability in magnetic white dwarfs GD 229 and G 240-72", "sha": "2ac47096e40d92ad595787c24c56a5fd214ec66a", "authors": ["A Berdyugin", "V Piirola"], "doi": null, "journal": "Astron. Astrophys"}
+{"title": "Tripartite motif-containing 22 inhibits the activity of hepatitis B virus core promoter, which is dependent on nuclear-located RING domain", "sha": "2ada09b4b4602b697c81180363a90a24fc6b7624", "authors": ["Bo Gao", "Zhijian Duan", "Wei Xu", "Sidong Xiong"], "doi": "10.1002/hep.23011", "journal": "Hepatology"}
+{"title": "Dual control of Shuanghuang Shengbai granule on upstream and downstream signal modulators of CyclinD-CDK4/6 signaling pathway of cell cycle in Lewis-bearing mice with cyclophosphamide-induced myelosuppression", "sha": "2b17f9d7f6e34095fc73d186d743439bab5b7121", "authors": ["Zhen-Ye Xu", "Xian-Gu", "Ling Yu-Zhu", "Li-Fang Wang", "Kai-Li", "Qiang-Pei"], "doi": "10.2147/ott.s37407", "journal": "OncoTargets and Therapy"}
+{"title": "A Service of zbw Are specific skills an obstacle to labor market adjustment?", "sha": "2b8821a952291b59ab956b134c71fdb9f029f776", "authors": [], "doi": null, "journal": null}
+{"title": "Insufficiency of Linear Coding in Network Information Flow", "sha": "2c80ca4786e43d4af0516e43662187c4652c048b", "authors": ["Randall Dougherty", "Christopher Freiling", "Kenneth Zeger"], "doi": null, "journal": null}
+{"title": "Lanchester\u2019s equations in three dimensions", "sha": "2cb62e2222040f0e6ff928fac3f598047347277e", "authors": ["Christina Spradlin", "Greg Spradlin"], "doi": "10.1016/j.camwa.2007.01.013", "journal": "Computers & Mathematics with Applications"}
+{"title": "Mechanisms and variation in plant development: sorting the wood from the trees in Vermont", "sha": "2cdc41c9aa3388221ad0189450d24e379fa33ab0", "authors": ["George Coupland"], "doi": "10.1242/dev.027383", "journal": "Development"}
+{"title": null, "sha": "2d26fcdce614c53747f0a301aaef33eeb2ce9b34", "authors": [], "doi": null, "journal": null}
+{"title": "Construction of BIBAC and BAC libraries from a variety of organisms for advanced genomics research", "sha": "2d7193e7ce2215d48a26b1566fa0cb814c2df26b", "authors": ["Hong-Bin Zhang", "Chantel Scheuring", "Meiping Zhang", "Yang Zhang", "Cheng-Cang Wu", "Jennifer Dong", "Yaning Li"], "doi": "10.1038/nprot.2011.456", "journal": "Nature Protocols"}
+{"title": "Privacy Preserving Techniques in Social Networks Data Publishing-A Review", "sha": "2daefa6ea16aadf6405a1656af4b088c05c85a3b", "authors": ["Amardeep Singh", "Scholar", "Divya Bansal", "Sanjeev Sofat"], "doi": null, "journal": "International Journal of Computer Applications"}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics", "sha": "2dfa559a1a261af0d55069ffe17a6835ac78abe7", "authors": ["Stephan Heblich", "Oliver Falck", "Christina G\u00fcnther", "William Kerr"], "doi": null, "journal": null}
+{"title": "The Adequacy of Speculation in Agricultural Futures Markets: Too Much of a Good Thing", "sha": "2e0859baebaf70574a78ef0f7d7736852edb7066", "authors": ["Dwight Sanders", "Scott Irwin", "Robert Merrin", "D Sanders", "S Irwin", "R Merrin"], "doi": null, "journal": "Proceedings of the NCCC-134 Conference on Applied Commodity Price Analysis, Forecasting, and Market Risk Management"}
+{"title": "93. MECHANISM OF BIOLOGICAL NITROGEN FIXATION 7. MOLECULAR H2 AND THE pN2 FUNCTION OF AZOTOBACTER1", "sha": "2eeaafb756aa5d3e4c051a5c2b67ac68b1020458", "authors": ["Orville Wyss", "C Lind", "J Wilson", "P Wilson"], "doi": null, "journal": null}
+{"title": "Human Visual System Integrates Color Signals along a Motion Trajectory", "sha": "2ef0bfa7c7b0bff2b8179fba07bc11dbb0d9e729", "authors": ["Shin&apos;ya Nishida", "Junji Watanabe", "Ichiro Kuriki", "Toyotaro Tokimoto"], "doi": "10.1016/j.cub.2006.12.041", "journal": "Current Biology"}
+{"title": "The origins of the welfare state in Portugal: the new frontiers between public and private*", "sha": "2f1f1d6a8b0600da9d27f3d557824716f0e2d97c", "authors": ["Miriam Halpern", "Pereira Iscte"], "doi": "10.1386/pjss.4.1.3/1", "journal": "Portugese Journal of Social Sciences"}
+{"title": "A Consolidated DaaS Model for Situation-Informed Incident Management", "sha": "2f20f3d0db467fc77eb19689c91091c89268089d", "authors": ["Nan Jiang", "Lai Xu", "Paul De Vrieze"], "doi": null, "journal": "IFIP AICT"}
+{"title": "On the Helmholtz Principle for Data Mining [Fulltext] Approved for External Publication Internal [Fulltext] On the Helmholtz Principle for Data Mining", "sha": "2f298b56b79c50e6a86e7861138ae84a2256ad96", "authors": ["Alexander Balinsky", "Helen Balinsky", "Steven Simske", "Alexander Balinsky", "Helen Balinsky", "Steven Simske"], "doi": null, "journal": null}
+{"title": "O Corpo Despido pelas Pr\u00e1ticas de Desenhar: dos usos \u00e0 disciplinariza\u00e7\u00e3o do desenho * The Uncovered Human Body by Practices of Drawing: from the uses to the disciplining of drawing", "sha": "2f4233b6f7fda830910139e7126578d0e148fd39", "authors": ["Rio Bolema", "Claro"], "doi": null, "journal": null}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics", "sha": "2f4c1043920382b288e59cd88a3357bf0432fb01", "authors": ["Mikl\u00f3s-Thal", "Jeanine Ullrich", "Hannes"], "doi": null, "journal": null}
+{"title": "Answering Regular Path Queries Using Views", "sha": "2f5b3b4d8523087a77cbb471e9a5c340e9e1fa6e", "authors": ["Diego Calvanese", "Giuseppe De", "Giacomo", "Maurizio Lenzerini", "\u00bd", "Moshe Vardi"], "doi": null, "journal": null}
+{"title": "Analysis of Matrine Alkaloids in Human Urine by Hollow Fiber Liquid-phase Microextraction with High-performance Liquid Chromatography", "sha": "2f5b84ff72bf2b1c92d3f3e439d6f5722c6b179c", "authors": ["Dan-Dan Han", "Kyung-Ho Row"], "doi": "10.5012/jkcs.2010.54.01.038", "journal": "Journal of the Korean Chemical Society"}
+{"title": "Space-variant Active Vision: Definition, Overview and Examples", "sha": "2f95e02f6c3b252de0fd7f8e67e3a62f56c0a8d3", "authors": ["Eric Schwartz", "Douglas Greve", "Giorgio Bonmassar"], "doi": null, "journal": "Neural Networks"}
+{"title": "A Functional Analysis of Sensory Units Innervating Epiglottis and Larynx", "sha": "2ff3fdda6892f35b447e6137e93f3029299518e0", "authors": ["Arthur Storey"], "doi": null, "journal": null}
+{"title": "Nemirovski's Inequalities Revisited", "sha": "2ffbaf317f0d7fa3813721756f82ea7e47555af1", "authors": ["Lutz D\u00fcmbgen", "Sara A. Van De Geer", "Mark C. Veraar", "Jon A. Wellner"], "doi": "10.4169/000298910x476059", "journal": "The American Mathematical Monthly"}
+{"title": "The first performance report for the Bio-Rad Dx CT/NG/MG assay for simultaneous detection of Chlamydia trachomatis, Neisseria gonorrhoeae and Mycoplasma genitalium in urogenital samples", "sha": "304abcd565e3b5bcc92c012b44b56aa3384c2c44", "authors": ["Chlo\u00e9 Le Roy", "Isabelle Le Hen", "Ma\u00efth\u00e9 Clerc", "V\u00e9ronique Arfel", "Fran\u00e7oise Normandin", "C\u00e9cile B\u00e9b\u00e9ar", "Bertille De Barbeyrac"], "doi": "10.1016/j.mimet.2012.03.009", "journal": "Journal of Microbiological Methods"}
+{"title": "Uniqueness/nonuniqueness for nonnegative solutions of second-order parabolic equations of the form ut=Lu+Vu\u2212\u03b3up in Rn", "sha": "30b7f709cc3d9a42b10c29fecda07d9e86c487ed", "authors": ["J\u00e1nos Engl\u00e4nder", "Ross Pinsky"], "doi": "10.1016/s0022-0396(03)00089-5", "journal": "Journal of Differential Equations"}
+{"title": "Automated segmentation of hepatic vessels in non-contrast X-ray CT images", "sha": "30e5fb58c61780dff1b8ec0e5771c65afc4f824d", "authors": ["Suguru Kawajiri", "Xiangrong Zhou", "Xuejun Zhang", "Takeshi Hara", "Hiroshi Fujita", "Ryujiro Yokoyama", "Hiroshi Kondo", "Masayuki Kanematsu", "Hiroaki Hoshi"], "doi": "10.1007/s12194-008-0031-4", "journal": "Radiological Physics and Technology"}
+{"title": "MCMC-BASED PEAK TEMPLATE MATCHING FOR GCXGC", "sha": "315b26e4648f3daa1c455959efeb0cf25eabc4b0", "authors": ["Mingtian Ni", "Qingping Tao", "Stephen Reichenbach"], "doi": null, "journal": null}
+{"title": null, "sha": "31fffb87d80647a955eb5357362826f76571c68f", "authors": [], "doi": null, "journal": null}
+{"title": "Three-State Conical Intersections in Nucleic Acid Bases", "sha": "3248056d9c214290579be25664e1a2938b71310d", "authors": ["Spiridoula Matsika"], "doi": "10.1021/jp0513622", "journal": "The Journal of Physical Chemistry A"}
+{"title": "3A6.3 Correlative Coding with Clipping and Filtering Technique in OFDM Systems", "sha": "3262fa93df27937cec4ea9606dd6b7b63d51a111", "authors": ["Sharifah Yusof", "Norsheila Fisal"], "doi": null, "journal": null}
+{"title": "Mathematical modeling of degradation for bulk-erosive polymers: Applications in tissue engineering scaffolds and drug delivery systems", "sha": "32b049910bb77ab8663af2698fc5449f7e3d49fd", "authors": ["Yuhang Chen", "Shiwei Zhou", "Qing Li"], "doi": "10.1016/j.actbio.2010.09.038", "journal": "Acta Biomaterialia"}
+{"title": "Durability of masonry systems: A laboratory study", "sha": "3321de8121a8f4904459491bff77b8c81c672da0", "authors": ["G Cultrone", "E Sebasti\u00e1n", "M Huertas"], "doi": "10.1016/j.conbuildmat.2005.07.008", "journal": "Construction and Building Materials"}
+{"title": "Chapter Title: The Influence of Income Tax Rules on Insurance Reserves", "sha": "33423950cccbc8f876ffcd72162e64b422666032", "authors": ["David Bradford", "Kyle Logue"], "doi": null, "journal": null}
+{"title": "P1 Comparison of carbamylated versus recombinant erythropoietin during spinal cord ischemia/reperfusion injury P2 Sodium 4-phenylbutylate protects against myocardial ischemia-reperfusion injury by reducing unfolded protein response-mediated apoptosis in mice P3 Time-dependent eff ects of intravenous H 2 S during long-term, resuscitated porcine hemorrhagic shock", "sha": "3343a575447995681ac961f66d83c5768abdd28c", "authors": ["F Simon", "A Scheuerle", "A Soell", "M Groeger", "O Mccook", "P Radermacher", "M Okajima", "M Takamura", "S Usui", "T Taniguchi", "S Kaneko", "H Bracht", "F Simon", "B Hauser", "M Groeger", "A Soell", "O Mccook", "M Georgieff", "P Radermacher", "C Szabo", "E Calzia"], "doi": null, "journal": "Critical Care"}
+{"title": "Um Modelo Hier\u00e1rquico de An\u00e1lise das Vari\u00e1veis S\u00f3cio-Econ\u00f4micas e dos Padr\u00f5es de Contatos com \u00c1guas Associados \u00e0 Forma Hepatoespl\u00eanica da Esquistossomose 1", "sha": "33ed802d62c93c38888fe986e8eaca174740f343", "authors": ["Cad", "Sa\u00fade P\u00fabl", "Rio De Janeiro"], "doi": null, "journal": null}
+{"title": "Effects of genetic polymorphisms of UCP2 and UCP3 on very low calorie diet-induced body fat reduction in Korean female subjects", "sha": "3423c92d5d89568abb80775c1b4c99736019e0c5", "authors": ["Yoosik Yoon", "Byung Park", "Min Cha", "Kil Kim", "Hyun Cheong", "Yoo Choi", "Hyoung Shin"], "doi": "10.1016/j.bbrc.2007.05.110", "journal": "Biochemical and Biophysical Research Communications"}
+{"title": "Conceptual Design of Data Warehouses from E/R Schemes", "sha": "345109f4668e8e5aed37606fee9f23ea6c30cc65", "authors": ["Matteo Golfarelli", "Dario Maio", "Stefano Rizzi"], "doi": null, "journal": "Proceedings of the Hawaii International Conference On System Sciences"}
+{"title": "Congenital Left Paraduodenal Hernia Causing Chronic Abdominal Pain and Abdominal Catastrophe", "sha": "34958f9d0a47436f538ba2f263ff03626d551f04", "authors": ["Y Shi", "A Felsted", "P Masand", "B Mothner", "J Nuchtern", "J Rodriguez", "S Vasudevan"], "doi": "10.1542/peds.2014-3701", "journal": "PEDIATRICS"}
+{"title": "PackBot: A Versatile Platform for Military Robotics", "sha": "34a748a0d60a6ff9fab3d0edbee1cd1b8621a5b8", "authors": ["Brian Yamauchi"], "doi": null, "journal": null}
+{"title": "BRAZILIAN ARCHIVES OF BIOLOGY AND TECHNOLOGY A N I N T E R N A T I O N A L J O U R N A L Population Structure and Somatic Indexes of Hypostomus cf. ancistroides (Siluriformes, Loricariidae) Collected from the Bonito River, Iva\u00ed River Basin, Turvo, Paran\u00e1", "sha": "34c77a534fe795f55b949a6f4ce4a8e76ce9ac61", "authors": ["Douglas Viana", "Luciano Wollf", "T\u00e2nia Zaleski", "Silvia Rom\u00e3o", "Gustavo Bertoldi", "Luc\u00e9lia Donatti"], "doi": null, "journal": "Braz. arch. biol. technol. v"}
+{"title": "Specimen Holders for Conducting Dynamic Mechanical Property Tests", "sha": "354ef362d55b520bdc7e79b88b65b4a1ec92cb59", "authors": [], "doi": null, "journal": null}
+{"title": "Dynamics of arachidonic acid mobilization by inflammatory cells", "sha": "359dec969545283d8a587bb7a13cb8bca6625c0e", "authors": ["Alma Astudillo", "David Balgoma", "Mar\u00eda Balboa", "Jes\u00fas Balsinde"], "doi": "10.1016/j.bbalip.2011.11.006", "journal": "Biochimica et Biophysica Acta (BBA) - Molecular and Cell Biology of Lipids"}
+{"title": "Prognostic markers in cancer: the evolution of evidence from single studies to meta-analysis, and beyond", "sha": "359e3691c11222bd8dc479bafc2d36f5bb34697e", "authors": ["R Riley", "W Sauerbrei", "D Altman"], "doi": "10.1038/sj.bjc.6604999", "journal": "British Journal of Cancer"}
+{"title": "A multi-resolution assessment of the Community Multiscale Air Quality (CMAQ) model v4.7 wet deposition estimates for 2002\u20132006", "sha": "35fc6ebd65003bb8375f8438b39e8babacbb3ae8", "authors": ["K Appel", "K Foley", "J Bash", "R Pinder", "R Dennis", "D Allen", "K Pickering"], "doi": "10.5194/gmd-4-357-2011", "journal": "Geoscientific Model Development"}
+{"title": "A Comparison of Supervised Machine Learning Techniques for Predicting Short-Term In-Hospital Length of Stay among Diabetic Patients", "sha": "3614e6b9313b0093bda33b719c28708637237323", "authors": ["April Morton", "Eman Marzban", "Georgios Giannoulis", "Ayush Patel", "Rajender Aparasu", "Ioannis Kakadiaris"], "doi": "10.1109/icmla.2014.76", "journal": "2014 13th International Conference on Machine Learning and Applications"}
+{"title": "Filamin A controls matrix metalloproteinase activity and regulates cell invasion in human fibrosarcoma cells", "sha": "362c2c71c7cbb740afee65c82451241336cb727d", "authors": ["M Baldassarre", "Z Razinia", "N N. Brahme", "R Buccione", "D Calderwood"], "doi": "10.1242/jcs.104018", "journal": "Journal of Cell Science"}
+{"title": "Do interest rates matter? Credit demand in the Dhaka slums * DO INTEREST RATES MATTER? CREDIT DEMAND IN THE DHAKA SLUMS", "sha": "367c542b8d4073c808b87d448e14b113d4766bab", "authors": ["Rajeev Dehejia", "Heather Montgomery", "Jonathan Morduch"], "doi": null, "journal": null}
+{"title": "Palonosetron-induced migraine-type headache", "sha": "375bf4bb5419bd7cf05b37d5049257e780682c91", "authors": ["Amit Jain", "A Jain", "A Jain"], "doi": "10.1007/s12630-010-9422-1", "journal": "Canadian Journal of Anesthesia/Journal canadien d'anesth\u00e9sie"}
+{"title": "Electric stunning of trout: power reduction using a two-stage stun", "sha": "37970ff7ab8f7133b6200ae82398a27b436c0ec1", "authors": ["Jeff Lines", "Steve Kestin"], "doi": "10.1016/j.aquaeng.2004.09.007", "journal": "Aquacultural Engineering"}
+{"title": "Stability and Control of Relative Equilibria of Three-Spacecraft Magnetically Tethered Systems", "sha": "37ae761b978b1eda3dc513aec9fd6445d2ba1eb9", "authors": ["I Hussein", "A Bloch"], "doi": null, "journal": null}
+{"title": "Asymmetric mapping from phonetic to lexical representations in second-language listening", "sha": "37bdf04c5ce5303e3ecd9c1ad6e6a8836c6df4f2", "authors": ["Anne Cutler", "Andrea Weber", "Takashi Otake"], "doi": "10.1016/j.wocn.2005.06.002", "journal": "Journal of Phonetics"}
+{"title": "Formulating State Space Models in R with Focus on Longitudinal Regression Models", "sha": "38ae4a33cef875631f5260b4d59523972815d73b", "authors": ["Claus Dethlefsen", "S\u00f8ren Lundbye-Christensen"], "doi": null, "journal": "JSS Journal of Statistical Software"}
+{"title": "Spectral Clustering of Shape and Probability Prior Models for Automatic Prostate Segmentation", "sha": "393fba8e909430ca84977b0890c932883f27c978", "authors": ["S Ghose", "J Mitra", "A Oliver", "R Mart\u00ed", "X Llad\u00f3", "J Freixenet", "J Vilanova", "J Comet", "D Sidib\u00e9", "F Meriaudeau"], "doi": null, "journal": null}
+{"title": "A Lock-Free, Concurrent, and Incremental Stack Scanning for Garbage Collectors", "sha": "395f08b05f1c83b68ac7c4c9ddf6bdcb32c4f9f0", "authors": ["Gabriel Kliot", "Erez Petrank", "Bjarne Steensgaard"], "doi": null, "journal": null}
+{"title": "Posterior probability maps and SPMs", "sha": "39d983858ba96f01e66f46143a7eb8efe77d55bb", "authors": ["K Friston", "W Penny"], "doi": "10.1016/s1053-8119(03)00144-7", "journal": "NeuroImage"}
+{"title": "Gold Nanobridge Stabilized by Surface Structure", "sha": "3a1a4a5b19be1e13f178ae11159a629b7c128788", "authors": ["Yukihito Kondo", "Kunio Takayanagi"], "doi": null, "journal": null}
+{"title": "Social organization in a flatworm: trematode parasites form soldier and reproductive castes", "sha": "3ad7cbb81fafd36776e909783130a73bbe690562", "authors": ["R Hechinger", "A Wood", "A Kuris"], "doi": "10.1098/rspb.2010.1753", "journal": "Proceedings of the Royal Society B: Biological Sciences"}
+{"title": "Drag effect of triple junctions on grain boundary and grain growth kinetics in aluminium", "sha": "3b077490a775f72b58758a5bfe3e867936b19323", "authors": ["D Mattissen", "D Molodov", "L Shvindlerman", "G Gottstein"], "doi": "10.1016/j.actamat.2005.01.016", "journal": "Acta Materialia"}
+{"title": "TCP PERFORMANCE IMPROVEMENT OVER WIRELESS ATM NETWORKS THROUGH A NEW AAL PROTOCOL", "sha": "3b2a0f364ef96e4e6e83cbdb43157c09dfff47fd", "authors": ["Ian Akyildiz", "Inwhee Joe"], "doi": null, "journal": null}
+{"title": "Palladium-Catalyzed Tetraarylation of 5,15-Dialkylporphyrins with Aryl Bromides", "sha": "3ba3f231382152a1365e861f1ba7efba44b6febf", "authors": ["Atsuhiro Osuka", "Hideki Yorimitsu", "Yutaro Yamamoto", "Sumito Tokuji", "Takayuki Tanaka"], "doi": "10.3987/com-13-s(s)5", "journal": "HETEROCYCLES"}
+{"title": "Ankle Anatomy for the Arthroscopist. Part I: The Portals", "sha": "3e8803ffdbed2fe90ede5d24836cfcbb88a2cccc", "authors": ["Pau Golan\u00f3", "Jordi Vega", "Luis P\u00e9rez-Carro", "V\u00edctor G\u00f6tzens"], "doi": "10.1016/j.fcl.2006.03.005", "journal": "Foot and Ankle Clinics"}
+{"title": "Cells, Gels, and the Engines of Life", "sha": "3ebf650663187b291c2033b2abb824f9adcea9f0", "authors": ["Gerald Pollack"], "doi": null, "journal": null}
+{"title": "CANONICAL SYSTEM ON ELLIPTIC CURVES", "sha": "3f3b73739cc3c2d568d63d921743aed3560add21", "authors": ["Luis Piovan"], "doi": null, "journal": "Mathematics Subject Classification. Primary 58F07"}
+{"title": "Observations of Pc 1-2 Waves in the Outer Magnetosphere", "sha": "3f5edd4c0905c59ab829d5f875629533a40b3667", "authors": ["Stanley Kaye", "Margaret Kivelson"], "doi": null, "journal": "NO. A8 JOURNAL OF GEOPHYSICAL RESEARCH AUGUST"}
+{"title": null, "sha": "3f9085123e1880ba6365abe6f0f40e059142f4f7", "authors": [], "doi": null, "journal": null}
+{"title": "Adaptive Content Presentation for the Web", "sha": "402a43745567aa0fe0dcd6bf3aa1eb0b21335098", "authors": ["P Brusilovsky", "A"], "doi": null, "journal": "LNCS"}
+{"title": "Sonoluminescence from Alkali-Metal Salt Solutions", "sha": "409727f7ced48b37f73fd6ae55fcd649798ac0a7", "authors": ["Edward Flint", "Kenneth Suslick"], "doi": null, "journal": "J. Phys. Chem"}
+{"title": "Flow-based Scatterplots for Sensitivity Analysis", "sha": "40a48c89bf9c5b672b335a48a3276a491804cf26", "authors": ["Yu-Hsuan Chan", "Carlos Correa", "Kwan-Liu Ma"], "doi": null, "journal": null}
+{"title": "Immobilization of invertase in conducting polymer matrices", "sha": "40d91ff7d41ef3d5421c322abc15e8ef3b845aa8", "authors": ["F Selampinar", "U Akbulut", "M Ozden", "L Toppare+"], "doi": null, "journal": "Biomaferids"}
+{"title": "A facile solvent-free synthesis route for the assembly of a highly CO2selective and H2S tolerant NiSIFSIX metal\u2013organic framework", "sha": "413aea4381568571749ada231ae90868ea64db2e", "authors": ["Osama Shekhah", "Youssef Belmabkhout", "Karim Adil", "Prashant Bhatt", "Amy Cairns", "Mohamed Eddaoudi"], "doi": "10.1039/c5cc04487a", "journal": "Chem. Commun."}
+{"title": "Optimal Bitwise Register Allocation using Integer Linear Programming", "sha": "4174ccdcc1b314ce1350b84cb37ed0e20ca48dee", "authors": ["Rajkishore Barik", "Christian Grothoff", "Rahul Gupta", "Vinayaka Pandit", "Raghavendra Udupa"], "doi": null, "journal": null}
+{"title": "Vigil\u00e2ncia Sanit\u00e1ria: uma proposta de an\u00e1lise dos contextos locais Sanitary Surveillance: a proposal for analyzing local environments", "sha": "41cf1a86e323d448523bc0e89f87afe9cc77769a", "authors": ["M\u00e1rcia Piovesan", "Maria Val\u00e9ria", "Vasconcelos Padr\u00e3o", "Maria Dumont", "Gracia Gondim", "Oviromar Flores", "Jos\u00e9 Pedrosa", "Luiz Felipe", "Moreira Lima"], "doi": null, "journal": "Rev Bras Epidemiol"}
+{"title": "Fractional rotational diffusion of rigid dipoles in an asymmetrical double-well potential", "sha": "41f71b5c93014a12d7ac020ca880fd1913b78d7e", "authors": ["William Coffey", "Yuri Kalmykov", "Sergey Titov", "Jagdish Vij"], "doi": "10.1103/physreve.72.011103", "journal": "Physical Review E"}
+{"title": "Biometric Applications Related to Human Beings: There Is Life beyond Security", "sha": "41fd1cd5e78ae710b67fd17ebe52b8e182f4b02e", "authors": ["Marcos Faundez-Zanuy", "Amir Hussain", "Jiri Mekyska", "Enric Sesa-Nogueras", "Enric Monte-Moreno", "Anna Esposito", "Mohamed Chetouani", "Josep Garre-Olmo", "Andrew Abel", "Zdenek Smekal", "Karmele Lopez-De-Ipi\u00f1a"], "doi": "10.1007/s12559-012-9169-9", "journal": "Cognitive Computation"}
+{"title": "Self-Stabilizing Structured Ring Topology P2P Systems", "sha": "4216191d4d5551a43228c60afaa79c62f1b07850", "authors": ["Ayman Shaker", "Douglas Reeves"], "doi": null, "journal": null}
+{"title": "Highly flexible supercapacitors with manganese oxide nanosheet/carbon cloth electrode", "sha": "426bf3b3614659fc002b0a93a249840be130f73f", "authors": ["Ying-Chu Chen", "Yu-Kuei Hsu", "Yan-Gu Lin", "Yu-Kai Lin", "Ying-Ying Horng", "Li-Chyong Chen", "Kuei-Hsien Chen"], "doi": "10.1016/j.electacta.2011.05.090", "journal": "Electrochimica Acta"}
+{"title": "Transport of Bunched Beams with Space Charge through a Periodic Lattice", "sha": "42adebc7bc21eb93e6e7df07e27008d9d9226a2c", "authors": ["M Reusch", "D Bruhwiler"], "doi": null, "journal": null}
+{"title": "Mixture of Trees Probabilistic Graphical Model for Video Segmentation", "sha": "432186760fe45c7f535a2c00bbb2b9ad77f136db", "authors": ["Vijay Badrinarayanan", "Ignas Budvytis", "Roberto Cipolla", "V Badrinarayanan", "I Budvytis", "\u00b7 Cipolla", "I Budvytis", "R Cipolla"], "doi": null, "journal": null}
+{"title": null, "sha": "4360ef549be2d8d1072df112b570549f24e8a2d2", "authors": [], "doi": null, "journal": null}
+{"title": "The Trajectory of Wealth in Retirement", "sha": "446ff148066cbca1cfa411a45bf25f5a96f47253", "authors": ["David Love", "Michael Palumbo", "Paul Smith"], "doi": null, "journal": null}
+{"title": "Gastric splenosis: a rare cause of digestive bleeding", "sha": "4481baee7f668a0c6524516d788c27fbd7bf4e0d", "authors": ["Bruno Arroja", "Nuno Almeida", "Charl Macedo", "Ana Moreira", "Pedro Oliveira", "Luis Tom\u00e9", "Hermano Gouveia", "Carlos Sofia"], "doi": null, "journal": "REV ESP ENFERM DIG (Madrid)"}
+{"title": "The information from muon arrival time distributions of high-energy EAS as measured with the KASCADE detector KASCADE Collaboration", "sha": "44bfe48de81ca32666068a6003ccb83c25b6e44e", "authors": ["T Antoni", "W Apel", "A Badea", "K Bekk", "A Bercuci", "H Bl\u20ac U Umer", "H Bozdog", "I Brancus", "C B\u20ac", "A Chilingarian", "K Daumiller", "P Doll", "J Engler", "F Fessler", "H Gils", "R Glasstetter", "R Haeusler", "A Haungs", "D Heck", "J H\u20ac O Orandel", "A Iwan", "K Kampert", "H Klages", "G Maier", "H Mathes", "H Mayer", "J Milke", "M M\u20ac U Uller", "R Obenland", "J Oehlschl\u20ac", "S Ostapchenko", "M Petcu", "H Rebel", "M Risse", "M Roth", "G Schatz", "H Schieler", "J Scholz", "T Thouw", "H Ulrich", "J Weber", "A Weindl", "J Wentz", "J Wochele", "J Zabierowski", "A Badea", ")"], "doi": null, "journal": null}
+{"title": "The class I bHLH factors E2-2A and E2-2B regulate EMT", "sha": "45ac32bf7a6ad8f03f3a5a63d0df3a27dca981fc", "authors": ["V Sobrado", "G Moreno-Bueno", "E Cubillo", "L Holt", "M Nieto", "F Portillo", "A Cano"], "doi": "10.1242/jcs.028241", "journal": "Journal of Cell Science"}
+{"title": "IJCA Special Issue on \"Evolutionary Computation for Optimization Techniques", "sha": "45c8e3e851fb51c4b8d8741c9d5cda26bec538fb", "authors": ["Tessy Varghese"], "doi": null, "journal": "ECOT"}
+{"title": "Title A quadratic assumed natural strain curved triangular shell element", "sha": "45effe3e1a56a2d791f00c1ae8730f5c0d7f4f03", "authors": ["Ky Sze", "Zhu"], "doi": null, "journal": "Computer Methods In Applied Mechanics And Engineering"}
+{"title": "Managing Disruptions in Decentralized Supply Chains with Endogenous Supply Process Reliability", "sha": "461ec751d6168f63555b157575f0912411685ee4", "authors": ["Sammi Tang", "Haresh Gurnani", "Diwakar Gupta"], "doi": "10.1111/poms.12160", "journal": "Production and Operations Management"}
+{"title": "Implementation and Utilization of a Heterogeneous Multicomputer Cluster for the Study of Load Balancing Strategies", "sha": "466390872d14ccb22865be037558e46a9ca72293", "authors": ["Per Andersen", "John Antonio"], "doi": null, "journal": null}
+{"title": "A Survey and Classification of Storage Deduplication Systems", "sha": "46a574413123beb2ba0572c563e1a4883baec997", "authors": ["Jo\u00e3o Paulo", "Jos\u00e9 Pereira"], "doi": "10.1145/2611778", "journal": "ACM Computing Surveys"}
+{"title": "The deep phylogeny of jumping spiders (Araneae, Salticidae)", "sha": "46a58d6350c445f22d30637f3e8cb60e1fafebc3", "authors": ["Wayne Maddison", "Daiqin Li", "Melissa Bodner", "Junxia Zhang", "Xu Xin", "Qinqing Liu", "Fengxiang Liu"], "doi": "10.3897/zookeys.440.7891", "journal": "ZooKeys"}
+{"title": "SEARCHING FOR CHEMICAL SIGNATURES OF MULTIPLE STELLAR POPULATIONS IN THE OLD, MASSIVE OPEN CLUSTER NGC 6791", "sha": "46e8d05168f7dfe8ba4188bdb137c68af7d95287", "authors": ["Angela Bragaglia", "Christopher Sneden", "Eugenio Carretta", "Raffaele Gratton", "Sara Lucatello", "Peter Bernath", "James Brooke", "Ram Ram"], "doi": "10.1088/0004-637x/796/1/68", "journal": "The Astrophysical Journal"}
+{"title": "Fast replication of out-of-plane microlens with polydimethylsiloxane and curable polymer (NOA73)", "sha": "46ed1a2249005c2ff0992ccd04857227f83cce76", "authors": ["Guocheng Shao", "Weiping Qiu", "Wanjun Wang"], "doi": "10.1007/s00542-009-1010-3", "journal": "Microsystem Technologies"}
+{"title": "Characterization of the Serum from a Patient with Insulin Resistance and Hypoglycemia Evidence for Multiple Populations of Insulin Receptor Antibodies with Different Receptor Binding and Insulin-mimicking Activities", "sha": "473b48562854741b770a8402e45dc41ba3b03b54", "authors": ["Roberto De", "Richard Roth", "Luciano Rossetti", "Ira Goldfine", "Roberto De Pirro"], "doi": null, "journal": null}
+{"title": "Diagnosis of Physical Systems With Hybrid Models Using Parametrized Causality", "sha": "479f7d842d0b8b575f6617898c2c7fca586756bb", "authors": ["Pieter Mosterman"], "doi": null, "journal": null}
+{"title": "Routes of Infection: Exports and HIV Incidence in Sub-Saharan Africa", "sha": "479fd1c2a3e5b5a8c9db77441a73203fc6a5ee0e", "authors": ["Emily Oster"], "doi": null, "journal": null}
+{"title": "Optimal Distributed Coding Schemes for Energy Efficiency in the Fading Relay Channel", "sha": "47af4052e284b48819ab263eaddaed41ed3315a8", "authors": ["Fanny Parzysz\u00e9cole", "Mai Vu", "Fran\u00e7ois Gagnon\u00e9cole"], "doi": null, "journal": null}
+{"title": "Role of enteral nutrition in nonthyroidal illness syndrome: a retrospective observational study", "sha": "47b08697c19b66d56e29e95397dda1c086316be5", "authors": ["Ranran Li", "Jianan Ren", "Qin Wu", "Gefei Wang", "Xiuwen Wu", "Jun Chen", "Guanwei Li", "Zhiwu Hong", "Huajian Ren", "Yunzhao Zhao", "Jieshou Li"], "doi": "10.1186/s12902-015-0061-y", "journal": "BMC Endocrine Disorders"}
+{"title": "Resonances in rotationally inelastic scattering of OH(X 2 \u03a0) with helium and neon", "sha": "47b4c21e49caa748e2239e0ee83de281b866c3f1", "authors": ["Koos Gubbels", "Qianli Ma", "Millard Alexander", "Paul Dagdigian", "Dick Tanis", "Gerrit Groenenboom", "Ad Van Der Avoird", "Sebastiaan Van De Meerakker"], "doi": null, "journal": null}
+{"title": "DTI reveals structural differences in white matter tracts between bilingual and monolingual children", "sha": "47d523cb18a2bf4f18ccd6cfda5ca71903cc7288", "authors": ["Seyede Mohades", "Esli Struys", "Peter Van Schuerbeek", "Katrien Mondt", "Piet Van De Craen", "Robert Luypaert"], "doi": "10.1016/j.brainres.2011.12.005", "journal": "Brain Research"}
+{"title": "Os cientistas e seus arquivos Os cientistas e seus arquivos Scientists and their archives", "sha": "48071801b0f600b27da976eba9b7126459063d79", "authors": ["V", "Celso Castro"], "doi": null, "journal": null}
+{"title": "Antecedentes de las intenciones de abandono en cooperativas colombianas", "sha": "485bca3c99c404801c31e1a9e0d0765773ecd7f3", "authors": ["Juan Rom\u00e1n-Calder\u00f3n", "Adalgisa Battistelli", "Mario Vargas-Saenz"], "doi": "10.1590/s0034-759020140607", "journal": "Revista de Administra\u00e7\u00e3o de Empresas"}
+{"title": "A critical role for NMDA receptors in parvalbumin interneurons for gamma rhythm induction and behavior", "sha": "4870ef390d6cf8c51fe0f8bf7fecf47054d8db22", "authors": ["M Carl\u00e9n", "K Meletis", "J Siegle", "J Cardin", "K Futai", "D Vierling-Claassen", "C R\u00fchlmann", "S Jones", "K Deisseroth", "M Sheng", "C Moore", "L-H Tsai"], "doi": "10.1038/mp.2011.31", "journal": "Molecular Psychiatry"}
+{"title": "Availability of food resources, distribution of invasive species, and conservation of a Hawaiian bird along a gradient of elevation", "sha": "48b897d30ca56d03e85cd7ca59f86e47455a984e", "authors": ["Paul Banko", "Peter Oboyski", "John Slotterback", "Steven Dougill", "Daniel Goltz", "Luanne Johnson", "Megan Laut", "T Murray"], "doi": null, "journal": null}
+{"title": "The pedagogy of virtual design studios", "sha": "4900722ee595e86b617766ec4fd0ed97b0c4e07a", "authors": ["Thomas Kvan"], "doi": null, "journal": "Automation in Construction"}
+{"title": "The Electronic Spectrum of the UO2Molecule", "sha": "49d9d699be752f6013f7ec8596acf514145406bb", "authors": ["Laura Gagliardi", "Michael Heaven", "Jesper Krogh", "Bj\u00f6rn Roos"], "doi": "10.1021/ja044940l", "journal": "Journal of the American Chemical Society"}
+{"title": "Non-linear growth effects of financial development: Does financial integration matter? \u2660", "sha": "4a09c2015bd42d1e04825294652045f6479e61a2", "authors": ["Arjana Brezigar", "Masten Fabrizio Coricelli", "Igor Masten"], "doi": null, "journal": null}
+{"title": "A probabilistic method for identifying start codons in bacterial genomes", "sha": "4a12a8304b52bf99646e09f2ba6bbc9b51773dfe", "authors": ["Baris Suzek", "Maria Ermolaeva", "Mark Schreiber", "Steven Salzberg"], "doi": null, "journal": "BIOINFORMATICS"}
+{"title": "Computational Analogues of Entropy", "sha": "4a60bf62c998a6d0bf85dea13146162e7cad8e42", "authors": ["Boaz Barak", "Ronen Shaltiel", "Avi Wigderson"], "doi": null, "journal": null}
+{"title": "Extending the charge-flipping method towards structure solution from incomplete data sets", "sha": "4b070ffa07b142e827de8901c1dde8a504020ed6", "authors": ["Luk\u00e1\u0161 Palatinus", "Walter Steurer", "Gervais Chapuis"], "doi": "10.1107/s0021889807007637", "journal": "Journal of Applied Crystallography"}
+{"title": "Evolutionary game dynamics in finite populations with strong selection and weak mutation", "sha": "4b0f5028ce155fa72ffa80e46e1e086395a66236", "authors": ["Drew Fudenberg", "Martin Nowak", "Christine Taylor", "Lorens Imhof"], "doi": "10.1016/j.tpb.2006.07.006", "journal": "Theoretical Population Biology"}
+{"title": "Clinical Section Differential Effects of Severe Self-injurious Behaviour on the Behaviour of Others", "sha": "4b5d5a1d186b4b12f0f42a9c2c3d995cd3ee99c6", "authors": ["Scott Hall", "Chris Oliver"], "doi": null, "journal": "Behavioural Psychotherapy"}
+{"title": "Genetic Programming with Simple Loops", "sha": "4b828892a3960a2820aed706f502547d08bbef55", "authors": ["Yuesheng Qi", "Baozhong Wang", "A", "G Lishan"], "doi": null, "journal": null}
+{"title": "\u00ca\u00ca\u00ca\u00ca\u00ca\u00da \u00d6\u00d9\u00d9\u00d6\u00dd\u00b8\u00bd\u00bd\u00bd\u00bd\u00d6\u00d9\u00d9\u00d6\u00dd\u00b8\u00bd\u00bd\u00bd\u00bd \u00cf \u00d8\u00d8\u00d6\u00d1\u00d1\u00d2\u00d2 \u00d8\u00d8\u00d8 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00d3\u00d3 \u00d4\u00d3\u00d7\u00d7\u00d8\u00d8\u00da \u00d2\u00d2 \u00d2\u00d2\u00d2\u00d2\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d7 \u00d2 \u00d7\u00d3\u00d0\u00d9\u00d9\u00d0\u00d0 \u00d6\u00d6\u00d6\u00d2\u00b9\u00b9\u00d1\u00d1\u00d8\u00d8\u00d8\u00d2\u00d2 \u00d0\u00d8\u00d8\u00d6\u00d2\u00d2\u00d8\u00d8\u00d2\u00d2 \u00d0\u00d3\u00d3 \u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6 \u00db\u00db\u00d8\u00d8 \u00d1\u00d1\u00d8\u00d8\u00d3\u00dc\u00dd \u00b9\u00d7\u00d9\u00d9\u00d7\u00d8\u00d8\u00d8\u00d9\u00d8\u00d8\u00d8 \u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8 \u00d7\u00d7\u00d7\u00d1\u00d1\u00d2\u00d8\u00ba \u00cc\u00cc\u00cc \u00d2\u00d2\u00d2\u00d2\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00d3\u00d3 \u00a2 \u00bd\u00bc \u00bd\u00bd \u00d1 \u00be \u00bb\u00ce\u00ba\u00d7 \u00d7 \u00d6\u00d6\u00d6\u00d8\u00d0\u00dd \u00d8\u00d8\u00d6\u00d1\u00d1\u00d2\u00d2\u00d2 \u00d9\u00d7\u00d7\u00d2\u00d2 \u00d7\u00d4\u00d4\u00d4\u00d4\u00b9\u00b9\u00d6\u00d6\u00d6\u00b9\u00d0\u00d0\u00d1\u00d1\u00d8\u00d8\u00d8 \u00d9\u00d6\u00d6\u00d6\u00d2\u00d8 \u00d8\u00d2\u00d2\u00d0\u00dd\u00d8\u00d8\u00d8\u00d8\u00d0 \u00d0\u00dc\u00d4\u00d6\u00d6\u00d7\u00d7\u00d7\u00d3\u00d2\u00d7\u00ba \u00c8\u00d3\u00d7\u00d7\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6 \u00d8\u00d6\u00d6\u00d2\u00d7\u00d4\u00d3\u00d6\u00d8 \u00d8\u00d7 \u00d7\u00d0\u00d7\u00d3 
\u00d7\u00d4\u00d4\u00d4\u00d4\u00b9\u00b9\u00d6\u00d6\u00d6\u00b9\u00d0\u00d0\u00d1\u00d1\u00d8\u00d8\u00d8\u00b8\u00db\u00db\u00d8\u00d8\u00d7\u00d4\u00d4\u00d4\u00d4\u00b9\u00b9\u00d6\u00d6\u00d6\u00b9\u00d0\u00d0\u00d1\u00d1\u00d8\u00d8\u00d8\u00b8\u00db\u00db\u00d8\u00d8 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00d3 \u00d3 \u00bd \u00a2 \u00bd\u00bc \u00d1 \u00be \u00bb\u00ce\u00ba\u00d7\u00ba \u00cc\u00cc\u00cc \u00cc\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d2 \u00d8\u00d6\u00d6\u00d4 \u00d4\u00d4\u00d7\u00d8\u00d6\u00d6\u00d6\u00d9\u00d8\u00d8\u00d3\u00d2 \u00d2\u00d7 \u00d7\u00dc\u00d4\u00d3\u00d2\u00d2\u00d2\u00d8\u00d8\u00d8\u00d0\u00b8\u00db\u00db\u00d8\u00d8\u00d7\u00dc\u00d4\u00d3\u00d2\u00d2\u00d2\u00d8\u00d8\u00d8\u00d0\u00b8\u00db\u00db\u00d8\u00d8 \u00d8 \u00d8\u00d6\u00d6\u00d6\u00d8\u00d8\u00d6\u00d6\u00d7\u00d8\u00d8\u00d8 \u00d2\u00d2\u00d6\u00d6\u00dd \u00d3\u00d3 \u00bc\u00ba\u00bd\u00be \u00ce\u00ba \u00ba \u00ba\u00d3\u00d0\u00d0 \u00d8\u00d6\u00d6\u00d4 \u00db\u00db\u00d8\u00d8 \u00d8\u00d2\u00d2\u00d6\u00d6\u00dd \u00bc\u00ba\u00ba \u00ce \u00db\u00d7 \u00d3\u00d3\u00d7\u00d7\u00d6\u00da\u00ba \u00cc\u00cc\u00cc\u00d7 \u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6 \u00d7 \u00d9\u00d7\u00d7\u00d7 \u00d7\u00d7 \u00d1\u00d1\u00d7\u00d7\u00d7\u00da \u00d1\u00d1\u00d8\u00d8\u00d6\u00d6\u00d6\u00d0 \u00d0\u00d2 \u00d3\u00d6\u00d6\u00d6\u00d2\u00d2\u00d2 \u00d0\u00d0\u00d0\u00d0\u00d8\u00b9\u00b9\u00d1\u00d1\u00d8\u00d8\u00d8\u00d2\u00d2 \u00d2\u00d2\u00d3\u00d3\u00d3\u00d7 \u00d8\u00d8\u00d8\u00d8 \u00d4\u00d6\u00d6\u00d7\u00d7\u00d2\u00d8 \u00d8 \u00d6 \u00d6 \u00d6 \u00d6 \u00d8\u00d2\u00d2\u00d7\u00d7 \u00d3\u00d3 \u00bc\u00bc \u00bc\u00bc\u00bb\u00d1 \u00be \u00d8 \u00bd\u00be\u00ba\u00ba \u00ce\u00ba \u00c1 \u00c1\u00d2\u00d8\u00d6\u00d3\u00d3\u00d9\u00d9\u00d8\u00d8\u00d3\u00d2 \u00d7 \u00d6\u00d6\u00d6\u00d6\u00d2\u00d8\u00d0\u00dd \u00dd\u00d2\u00d2\u00d0\u00dd\u00d8\u00d8\u00d8\u00d8\u00d0\u00d0\u00dd 
\u00dd\u00dd\u00d1\u00d3\u00d2\u00d7\u00d8\u00d6\u00d6\u00d8\u00d8\u00d8\u00b8\u00d8\u00d8\u00d8\u00dd\u00dd\u00d1\u00d3\u00d2\u00d7\u00d8\u00d6\u00d6\u00d8\u00d8\u00d8\u00b8\u00d8\u00d8\u00d8 \u00d8\u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00b9 \u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00dd\u00d7 \u00d7 \u00d4\u00d4\u00d6\u00d6\u00d1\u00d1\u00d8\u00d8\u00d6 \u00d8\u00d8\u00d8\u00d8 \u00d4\u00d0\u00d0\u00dd\u00d7 \u00d7 \u00d7\u00d6\u00d9\u00d9\u00d9\u00d9\u00d0 \u00d6\u00d3\u00d0\u00d0 \u00d0\u00d2 \u00d8\u00d8\u00d8 \u00d4\u00d4\u00d6\u00d6\u00d3\u00d6\u00d1\u00d1\u00d2\u00d2\u00d2 \u00d3\u00d3 \u00d4\u00d6\u00d6\u00d7\u00d8\u00d8\u00d2\u00d2\u00b9\u00d7\u00d7\u00d1\u00d1\u00d1\u00d3\u00d2\u00d2\u00d9\u00d9\u00d8\u00d3\u00d6\u00b9 \u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6\u00d7\u00b9 \u00d7\u00d7\u00d7 \u00d0\u00d0\u00d0\u00d0\u00d8\u00b9\u00b9\u00d1\u00d1\u00d8\u00d8\u00d8\u00d2\u00d2 \u00da\u00da\u00da\u00da\u00d7\u00b4\u00c4\u00c4\u00c4\u00d7\u00b5\u00da\u00da\u00da\u00da\u00d7\u00b4\u00c4\u00c4\u00c4\u00d7\u00b5 \u00bd\u00bd\u00ba \u00cc\u00cc\u00cc \u00d7\u00d8\u00d6\u00d6\u00b9 \u00d9\u00d8\u00d8\u00d3\u00d2 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d1\u00d1\u00d7\u00d7\u00d7\u00da \u00d6\u00d6\u00d6\u00d6\u00d3\u00d2 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d0\u00d9\u00d1\u00d1\u00d2\u00d2\u00d7\u00d7\u00d7\u00d2\u00d8 \u00d1\u00d1\u00d8\u00d8\u00d6\u00d6\u00d6\u00d0 \u00d0\u00d2 \u00d2 \u00d8\u00d8\u00d8\u00d2 \u00ac\u00d0\u00d1 \u00da\u00da\u00da\u00da \u00da\u00d7 \u00d7\u00d7\u00d6\u00d6\u00d6\u00d8\u00d0\u00dd \u00dd\u00d3\u00d2\u00d8\u00d6\u00d3\u00d0\u00d0\u00d0\u00d0 \u00d0\u00dd \u00d8\u00d8\u00d8 \u00d8\u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00d8\u00d8\u00d7\u00ba \u00c1\u00d2 \u00d1\u00d3\u00d7\u00d8 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d8\u00d7\u00d7\u00d7 \u00d8\u00d8\u00d8 
\u00d4\u00d3\u00d7\u00d7\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00dd\u00d7 \u00d9\u00d4 \u00d8\u00d3 \u00d3 \u00d3\u00d3\u00db \u00d3\u00d6\u00d6\u00d6\u00d6\u00d7 \u00d3\u00d3 \u00d1\u00d1\u00d1\u00d2\u00d2\u00d8\u00d9\u00d9\u00d9 \u00d0\u00d0\u00d6\u00d6\u00d6\u00d6 \u00d8\u00d8\u00d8\u00d2 \u00d8\u00d8\u00d8 \u00d2\u00d2\u00d2\u00d2\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00b9 \u00d0\u00d0\u00d8\u00dd\u00b8\u00b8\u00d3\u00d2\u00ac\u00d2\u00d2\u00d2\u00d2\u00d0\u00d0\u00d8\u00dd\u00b8\u00b8\u00d3\u00d2\u00ac\u00d2\u00d2\u00d2\u00d2 \u00d8\u00d8\u00d8 \u00d0\u00d0\u00d0\u00d0\u00d8 \u00d8\u00d1\u00d1\u00d7\u00d7\u00d7\u00d3\u00d2 \u00d6\u00d6\u00d6\u00d6\u00d3\u00d2 \u00d8\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d8\u00d8\u00d8\u00d3\u00d3\u00d3 \u00d2\u00d2\u00d2\u00d2\u00d2\u00d3\u00d6\u00d6\u00d3\u00d3\u00d3 \u00be\u00be\u00ba \u00cf\u00cf\u00cf\u00cf\u00d6 \u00d7\u00d8\u00d6\u00d6\u00d6\u00d9\u00d8\u00d8\u00d3\u00d2\u00d7 \u00db\u00d3\u00d9\u00d0\u00d0 \u00d3\u00d3\u00b9 \u00d7\u00d7\u00d6\u00da \u00db\u00db\u00db\u00d2 \u00d8\u00d8\u00d8 \u00d6\u00d6\u00d8\u00d8\u00d3 \u00d3\u00d3\u00d8\u00db\u00d2 \u00d8\u00d8\u00d8 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00d3\u00d3 \u00d4\u00d3\u00d7\u00d7\u00d8\u00d8\u00da \u00d2\u00d2 \u00d2\u00d2\u00d2\u00d2\u00d8\u00d8\u00da \u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d7 \u00d8\u00d8\u00d2\u00d2\u00d7 \u00d8\u00d3 \u00d3\u00d2\u00d2 \u00d2\u00bd\u00bd\u00ba \u00d3\u00d6 \u00c4\u00c4\u00c4 \u00c4\u00d4\u00d4\u00d0\u00d0\u00d0\u00d0\u00d8\u00d8\u00d3\u00d2\u00d7 \u00d7 \u00d7\u00d8\u00d6\u00d6\u00d2\u00d2\u00d8\u00d8 \u00d8\u00d3\u00d2\u00d8\u00d6\u00d3\u00d0 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d1\u00d1\u00d7\u00b9 \u00d7\u00d7\u00d3\u00d2 \u00db\u00da\u00d0\u00d0\u00d2\u00d2\u00d8\u00d8 \u00d1\u00d1\u00dd \u00dd \u00dd 
\u00d2\u00d8\u00d8\u00d6\u00d6\u00d7\u00d8\u00d8\u00d2\u00d2\u00ba \u00cc\u00cc\u00cc \u00d9\u00d7\u00d7 \u00d3\u00d3 \u00d0\u00d3\u00d3 \u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6\u00d7 \u00d4\u00d4\u00d6\u00d1\u00d1\u00d8\u00d7 \u00d8\u00d3 \u00d3\u00d2\u00d8\u00d6\u00d3\u00d0 \u00d8\u00d8\u00d8 \u00d1\u00d1\u00d7\u00d7\u00d7\u00d3\u00d2 \u00d2 \u00db\u00dd \u00d7\u00d7\u00d1\u00d1\u00d0\u00d0\u00d6 \u00d8\u00d3 \u00d8\u00d8\u00d8 \u00da\u00d6\u00d6\u00d6\u00d8\u00d8\u00d3\u00d2 \u00d3\u00d3 \u00d3\u00d0\u00d0\u00d0\u00d3\u00d1\u00d1\u00d6 \u00d1\u00d3\u00d0\u00d0\u00d0\u00d9\u00d0\u00d0\u00d6 \u00d1\u00d1\u00d7\u00d7\u00b8\u00d9\u00d8 \u00d1\u00d1\u00d7\u00d7\u00b8\u00d9\u00d8 \u00db\u00db\u00d8\u00d8\u00d3\u00d9\u00d8 \u00d0\u00d3\u00d7\u00d7 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d9\u00d7\u00d7\u00d7\u00d9\u00d0 \u00d1\u00d1\u00d1\u00d2\u00d2\u00d2\u00d2\u00d0 \u00d4\u00d6\u00d3\u00d4\u00d4\u00d6\u00d8\u00d8\u00d8\u00d7 \u00d3\u00d3 \u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6\u00d7\u00ba \u00c1\u00d2 \u00d8\u00d8\u00d8\u00d7 \u00d7\u00d7 \u00d8\u00d8\u00d8 \u00d8\u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6\u00d7 \u00d7\u00d6\u00d6 \u00d6\u00d3\u00d2\u00d7\u00d8\u00d8\u00d8\u00d9\u00d8\u00d8\u00d8 \u00dd \u00d0\u00d8\u00d8\u00d6\u00d2\u00d2\u00d8\u00d8\u00d8 \u00d0\u00d3\u00d3\u00d7 \u00d3\u00d3 \u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8\u00b4\u00b4\u00d6\u00d3\u00d1\u00d3\u00d4\u00d4\u00d3\u00d6\u00b5\u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8\u00b4\u00b4\u00d6\u00d3\u00d1\u00d3\u00d4\u00d4\u00d3\u00d6\u00b5 \u00d2\u00d2 \u00d2\u00d3\u00d2\u00b9\u00b9\u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8\u00b4\u00d7\u00d4\u00d4\u00d4\u00d4\u00d6\u00b5\u00d2\u00d3\u00d2\u00b9\u00b9\u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8\u00b4\u00d7\u00d4\u00d4\u00d4\u00d4\u00d6\u00b5 \u00d7\u00d7\u00d7\u00d1\u00d1\u00d2\u00d8\u00d7\u00ba \u00cc\u00cc\u00cc \u00cc\u00d1\u00d1\u00d7\u00d7\u00d7\u00d3\u00d2 \u00d2\u00d3\u00d0\u00d3\u00d6 
\u00d3\u00d2\u00d8\u00d6\u00d3\u00d0 \u00d0\u00d7 \u00da \u00dd \u00d8\u00d8\u00d8 \u00d3\u00d2\u00d8\u00d6\u00d3\u00d0 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d0\u00d0\u00d2\u00d2\u00d8\u00d8 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d3\u00d2\u00d2\u00d9\u00d9\u00d9\u00d8\u00d8\u00d8 \u00d7\u00d7\u00d7\u00d1\u00d1\u00d2\u00d8 \u00d8\u00bf\u00b8\u00b8\u00b8\u00ba \u00cb\u00d3\u00d1\u00d1 \u00d3\u00d3 \u00d8\u00d8\u00d8\u00d7\u00d7 \u00d7\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d0\u00d9\u00d1\u00d1\u00d2\u00d2\u00d7\u00d7\u00d7\u00d2\u00d8 \u00d8\u00d0\u00d8\u00d8\u00d6\u00d2\u00d2\u00d8\u00d8\u00d2\u00d2 \u00d2\u00d0\u00d3\u00d3 \u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6\u00d7 \u00d4\u00d6\u00d6\u00d7\u00d7\u00d2\u00d8 \u00d4\u00d4\u00d3\u00d8\u00d3\u00d0\u00d9\u00d1\u00d1\u00d2\u00d2\u00d7\u00d7\u00d7\u00d2\u00d2\u00d2 AEAEAEAE\u00d2\u00d2\u00dd \u00d2 \u00d8\u00d8\u00d8 \u00d7\u00d3\u00d0\u00d0\u00d0 \u00d4\u00d4\u00d4\u00d7\u00d7\u00b8\u00db\u00db\u00db\u00db\u00d4\u00d4\u00d4\u00d7\u00d7\u00b8\u00db\u00db\u00db\u00db \u00db \u00d7 \u00d7\u00d8\u00d8\u00d6\u00d6\u00d6\u00d9\u00d8\u00d8\u00d8 \u00d8\u00d3 \u00d3\u00d6\u00d6\u00d6\u00d8\u00d8\u00d6 \u00d6\u00dc\u00b9 \u00d8\u00d3\u00d2\u00d2\u00d2 \u00d2\u00d3\u00d2\u00ac\u00d2\u00d2\u00d1\u00d1\u00d2\u00d8 \u00d4 \u00d6 \u00d3 \u00da\u00da\u00da\u00da\u00da \u00da\u00dd \u00d8\u00d8\u00d8 \u00d8\u00d0\u00d3\u00d3 \u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6 \u00d6\u00dd \u00da\u00da\u00d6\u00d8\u00d9\u00d9 \u00d3\u00d3 \u00d4\u00d4\u00d4\u00d7\u00d7 \u00d7\u00d7\u00d7\u00d6\u00d6\u00d6\u00d6\u00d8\u00d8\u00d8 \u00d1\u00d3\u00d6\u00d4\u00d4\u00d3\u00d0\u00d3\u00d3\u00dd \u00dd\u00dd\u00dd\u00ba \u00c7\u00d2\u00d2 \u00d3\u00d9\u00d0\u00d0 \u00dc\u00d4\u00d4\u00d4\u00d8 \u00d8\u00d8\u00d8\u00d8 \u00d8\u00d8\u00d8\u00d7 \u00d7\u00d6\u00d6\u00d6\u00d8\u00d8\u00d6\u00d6\u00d7\u00d8\u00d8\u00d8 \u00d8\u00d7\u00d7\u00d3\u00d3\u00d3\u00d3\u00d8\u00d8\u00d8 \u00d8\u00d3 \u00d3 \u00db\u00db\u00db\u00db \u00d6\u00d6\u00b9 
\u00d3\u00d1\u00d2\u00d2\u00d8\u00d8\u00d3\u00d2 \u00de\u00d3\u00d2\u00d2 \u00d2\u00d2\u00d7\u00d7\u00d7\u00d7 \u00d8\u00d8\u00d8 \u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6 \u00d0\u00d0\u00dd\u00d6 \u00d3\u00d3 \u00d3 \u00c4\u00c4\u00c4 \u00c4\u00d0\u00d7\u00d3 \u00db\u00d3\u00d9\u00d0\u00d0 \u00d0\u00d1\u00d4\u00d6\u00d3\u00da \u00d8\u00d8\u00d8 \u00d8\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d0\u00d9\u00d1\u00d1\u00d2\u00d2\u00d7\u00d7\u00d7\u00d2\u00d8 \u00d8AEAEAEAE\u00d2\u00d2\u00dd \u00dd\u00d9\u00d9 \u00d8\u00d3 \u00d6\u00d6\u00d6\u00d9\u00d9\u00d9\u00d9 \u00d9\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d3\u00d3 \u00d5\u00d9\u00d9\u00d2\u00d2\u00d2\u00d2 \u00d3\u00d3 \u00d3\u00dc\u00dc\u00dc\u00d8\u00d3\u00d2\u00d7\u00ba \u00cc\u00d3 \u00d3 \u00d3 \u00da \u00d8\u00d8\u00d8 \u00d3\u00d2\u00d8\u00d6\u00d3\u00d0 \u00d8\u00d8\u00d8 \u00d8\u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00dd \u00dd \u00d1\u00d3\u00d0\u00d0\u00d0\u00d9\u00d0\u00d0\u00d6 \u00d6\u00d6\u00d7\u00d7\u00d7\u00d2 \u00d7 \u00d2\u00d2\u00d2\u00d2\u00d7\u00d7\u00d7\u00d6\u00dd \u00d8\u00d3 \u00d9\u00d2\u00d2\u00d2\u00d6\u00d7\u00d8\u00d8\u00d2\u00d2 \u00d3\u00db \u00d8\u00d8\u00d8 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00dd \u00d7 \u00d2\u00ad\u00d9\u00b9 \u00d2\u00d2\u00d2\u00d2 \u00d2\u00dd \u00d7 \u00d7 \u00da \u00d6\u00d6\u00d0 \u00d3\u00d3 \u00d8\u00d8\u00d8 \u00d8\u00d3\u00d4\u00d3\u00d0\u00dd\u00d1\u00d1\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d8\u00d8\u00d6\u00d6\u00d7\u00d8\u00d8\u00d8\u00d7 \u00d7\u00d9\u00d9 \u00d7 \u00d7\u00d6\u00d3\u00d1\u00d3\u00d4\u00d4\u00d3\u00d6\u00d6 \u00d0\u00d0\u00d2\u00d2\u00d8\u00d8\u00b8\u00d7\u00d4\u00d4\u00d4\u00d4\u00d0\u00d0\u00d2\u00d2\u00d8\u00d8\u00b8\u00d7\u00d4\u00d4\u00d4\u00d4 \u00d0\u00d0\u00d2\u00d2\u00d8\u00d8 \u00d8\u00d2\u00d2 \u00d7\u00d9\u00d9\u00d7\u00d8\u00d8\u00d8\u00d9\u00d8\u00d8\u00d3\u00d2\u00ba \u00cc\u00cc\u00cc \u00d4\u00d3\u00d7\u00d7\u00d8\u00d8\u00da 
\u00d6\u00d6\u00d6 \u00d6\u00d6\u00d6\u00d6\u00d6 \u00d1\u00d3\u00d3\u00d3\u00d0\u00d0\u00d8\u00d8\u00d8\u00d7 \u00d3\u00d3\u00d7\u00d7\u00d6\u00da \u00d2 \u00d7\u00d7\u00d6\u00d6\u00d6\u00d7 \u00d3\u00d3 \u00d3\u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d0\u00d9\u00d1\u00d1\u00d2\u00d2\u00d7\u00d7\u00d7\u00d2\u00d8 \u00d8\u00d0\u00d8\u00d8\u00d6\u00d2\u00d2\u00d8\u00d8\u00d2\u00d2 \u00d2\u00d0\u00d3\u00d3 \u00d3\u00d4\u00d3\u00d0\u00dd\u00b9 \u00d1\u00d1\u00d6\u00d7 \u00d7\u00d7\u00b9\u00b9\u00b9 \u00b9\u00d7 \u00d7\u00d3\u00d2\u00d7\u00d7\u00d7\u00d8\u00d8\u00d2\u00d8 \u00db \u00db \u00d8 \u00d8 \u00d8\u00d8\u00d8 \u00d8 \u00d8\u00d8\u00d8\u00d8 \u00d6\u00d6\u00d6\u00d9\u00d9\u00d8\u00d8\u00d3\u00d2 \u00d2 \u00d8\u00d8\u00d8 \u00d7\u00d4\u00d4\u00d4\u00d4\u00d6 \u00d0\u00d0\u00d2\u00d2\u00d8\u00d8 \u00d1\u00d4\u00d0\u00d0\u00d0\u00d7 \u00d2 \u00d6\u00d6\u00d6\u00d9\u00d9\u00d8\u00d8\u00d3\u00d2 \u00d2 \u00d8\u00d8\u00d8 \u00da\u00b9 \u00a3 \u00d0\u00d0\u00d0\u00d8\u00d6\u00d3\u00d2\u00d2\u00d2 \u00d1\u00d1\u00d1\u00d0\u00d0 \u00d0\u00d0\u00d0\u00d0\u00ac\u00d7\u00d7\u00d7\u00d7\u00ba\u00d9\u00d9\u00d4\u00d6\u00ba\u00ba\u00d6 \u00dd \u00c8\u00d6\u00d6\u00d7\u00d7\u00d2\u00d8 \u00d8\u00d8\u00d8\u00d6\u00d6\u00d7\u00d7\u00d7 \u00d7\u00d7\u00d4\u00d4\u00d6\u00d8\u00d1\u00d1\u00d2\u00d8 \u00d3 \u00d3 \u00c8 \u00c8 \u00dd\u00d7\u00d7\u00d7\u00d7\u00b8\u00cc\u00cc\u00cc\u00dd\u00d7\u00d7\u00d7\u00d7\u00b8\u00cc\u00cc\u00cc \u00c7\u00c7\u00c7\u00d3 \u00cb\u00d8\u00d8\u00d8\u00d8 \u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd\u00b8\u00b8\u00d3\u00d0\u00d9\u00d1\u00d9\u00d7\u00b8\u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd\u00b8\u00b8\u00d3\u00d0\u00d9\u00d1\u00d9\u00d7\u00b8 \u00c7\u00c7\u00c7\u00d3 \u00d3\u00bf\u00be\u00bd\u00bc\u00b9\u00bd\u00bd\u00bc\u00bc\u00b8\u00cd\u00cb\u00cb\u00ba\u00d3\u00bf\u00be\u00bd\u00bc\u00b9\u00bd\u00bd\u00bc\u00bc\u00b8\u00cd\u00cb\u00cb\u00ba \u00de \u00c8\u00d6\u00d6\u00d7\u00d7\u00d2\u00d8 
\u00d8\u00d8\u00d8\u00d6\u00d6\u00d7\u00d7\u00d7 \u00d7\u00d7\u00d2\u00d8\u00d8\u00d6 \u00d6\u00d3\u00d6 \u00d6\u00d6\u00d7\u00d4\u00d0\u00d0\u00dd \u00cc \u00d2\u00d3\u00d0\u00d3\u00d3\u00dd \u00b2 \u00c5\u00c5\u00d2\u00d9\u00d9\u00d9\u00d9\u00d8\u00d9\u00d6\u00d6\u00d2\u00d2\u00b8\u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd\u00c5\u00c5\u00d2\u00d9\u00d9\u00d9\u00d9\u00d8\u00d9\u00d6\u00d6\u00d2\u00d2\u00b8\u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd \u00d3\u00d3 \u00c5\u00c5\u00c5\u00d2\u00b8\u00b8\u00d2\u00d2\u00c5\u00c5\u00c5\u00d2\u00b8\u00b8\u00d2\u00d2 \u00d2\u00d6\u00d6\u00d3\u00d6\u00b8\u00c5\u00c1\u00d2\u00d6\u00d6\u00d3\u00d6\u00b8\u00c5\u00c1 \u00c1\u00c1\u00bd\u00bc\u00bc\u00b9\u00be\u00bd\u00bc\u00bc\u00b8\u00cd\u00cb\u00cb\u00ba\u00c1\u00c1\u00bd\u00bc\u00bc\u00b9\u00be\u00bd\u00bc\u00bc\u00b8\u00cd\u00cb\u00cb\u00ba", "sha": "4bf7894bd7d6e0e55cfe71ba97c718e936da737b", "authors": [], "doi": null, "journal": null}
+{"title": "ON A LITTLEWOOD-PALEY TYPE INEQUALITY", "sha": "4c072ef1b71da7def7bf19056aca9259bf6c3132", "authors": [], "doi": null, "journal": null}
+{"title": "Advanced Numerical Simulations of Temperature Anisotropy Instabilities and Collective Interaction Processes in High-Intensity Bunched Ion Beams", "sha": "4c5c4cca2cf0f8891b891a4ec4c4d03d89af2623", "authors": ["Hong Qin", "Ronald Davidson", "Edward Startsev"], "doi": null, "journal": null}
+{"title": "Basis-free representations for the stress rate of isotopic materials", "sha": "4cd92ed9c57264a2571ed4f543a9ae1192f498dc", "authors": ["Guansuo Dui", "Yi-Chao Chen"], "doi": "10.1016/j.ijsolstr.2004.03.003", "journal": "International Journal of Solids and Structures"}
+{"title": "Health Disparities: A Rural-Urban Chartbook At the Heart of Health Policy", "sha": "4d151aec2dd927134bb0f52628cf76b986f4cff1", "authors": ["Rural South", "Carolina Rural", "Health South"], "doi": null, "journal": null}
+{"title": "INFLUENCE OF INTESTINAL SURFACTANT LIKE PARTICLES ON DIFFERENTIAL ACTIVATION OF SECONDARY SIGNALING MOLECULES DURING SALMONELLA TYPHIMURIUM INFECTION", "sha": "4d1e99b62b85d3767d369797d40efdff5e2ccf36", "authors": ["M Sofi", "Archana Bhatnagar", "Saveeta Sapra", "Akhtar Mahmood", "Sidhartha Majumdar"], "doi": "10.1111/j.1745-4565.2010.00219.x", "journal": "Journal of Food Safety"}
+{"title": "IR DISSOCIATION OF AMMONIA CLUS'IERS", "sha": "4d68c35017144b932f88635cb55faa84e898c468", "authors": ["M Snels", "R Fantoni &apos;", "R Sanders", "W Meerts"], "doi": null, "journal": "Chemical Physics"}
+{"title": null, "sha": "4dc949cc33ae646b66ed35bfe93b04127791b5d8", "authors": [], "doi": null, "journal": null}
+{"title": "NMR Structure of an Archaeal Homologue of Ribonuclease P Protein Rpp29\u2020", "sha": "4e648ac197a914d7c33f0717caf6775aab208df7", "authors": ["David Sidote", "David Hoffman"], "doi": "10.1021/bi030170z", "journal": "Biochemistry"}
+{"title": "Estudo dos vest\u00edgios de peixes dos s\u00edtios arqueol\u00f3gicos da \u00e1rea de influ\u00eancia da Usina Hidrel\u00e9trica Machadinho, Rio Grande do Sul, Brasil", "sha": "4e69fdb25d5beaa5e0f7276c26ee1188ce7fecc4", "authors": ["Cl\u00e1udio Ricken", "Luiz Malabarba"], "doi": "10.1590/s1984-46702009005000003", "journal": "Zoologia (Curitiba)"}
+{"title": "Per-band Link Control Transients Protection in Distributed Fiber Raman Amplifier Cascades", "sha": "4e86572fe35d91529d210b7878de2a90a2c45a26", "authors": ["Victor Pincheira", "Marcelo Soto", "Ricardo Olivares"], "doi": null, "journal": null}
+{"title": "Polymer-Free Biolimus A9-Coated Stent Demonstrates More Sustained Intimal Inhibition, Improved Healing, and Reduced Inflammation Compared With a Polymer-Coated Sirolimus-Eluting Cypher Stent in a Porcine Model", "sha": "4e910ab15f305696e77817254d3e28665d1d8822", "authors": ["N Tada", "R Virmani", "G Grant", "L Bartlett", "A Black", "C Clavijo", "U Christians", "R Betts", "D Savage", "S Su", "J Shulze", "S Kar"], "doi": "10.1161/circinterventions.109.877522", "journal": "Circulation: Cardiovascular Interventions"}
+{"title": "Adaptive Demodulation Performance over a Rayleigh Fading Channel", "sha": "4e9ddb0f161fbe6e4c111d2bda6b444ecdc2bb23", "authors": ["J Brown", "Konstantinos Plataniotis", "Subbarayan Pasupathy"], "doi": null, "journal": null}
+{"title": "Aeroelastic flutter of feathers, flight and the evolution of non-vocal communication in birds", "sha": "4ea1915c88c66d971f2e525a37a780e54930af18", "authors": ["C Clark", "R Prum"], "doi": "10.1242/jeb.126458", "journal": "Journal of Experimental Biology"}
+{"title": "AdaptSPEC: Adaptive Spectral Estimation for Nonstationary Time Series", "sha": "4ea1c372db36312970ad26be40bd9abffb45e3da", "authors": ["Ori Rosen", "Sally Wood", "David Stoffer"], "doi": null, "journal": null}
+{"title": "Taxa metab\u00f3lica de repouso e composi\u00e7\u00e3o corporal em mulheres na p\u00f3s-menopausa Resting metabolic rate and body composition in postmenopausal women", "sha": "4ecc09d70c152bc695be800f7cc02ceaab038f51", "authors": ["Val\u00e9ria Bonganha", "Miguel Soares Concei\u00e7\u00e3o", "Claudinei Ferreira", "Dos Santos", "Mara Patr\u00edcia", "Tra\u00edna Chacon-Mikahil", "Vera Madruga"], "doi": null, "journal": "Arq Bras Endocrinol Metab"}
+{"title": "Molecular Manipulation of G-Protein-coupled Receptors: A New Avenue into Drug Discovery", "sha": "4ed658a94515ab988e31999eed9a0b75f0488acf", "authors": ["M Sautel", "G Milligan"], "doi": null, "journal": null}
+{"title": null, "sha": "4ee5c296fa1e1e520e7706bbd463490efa093fb0", "authors": ["Sotero Mengue", "Eloir Schenkel", "Maria Schmidt", "Bruce Duncan"], "doi": null, "journal": null}
+{"title": null, "sha": "4f1b82447384c783c7472679b1c992968eac215f", "authors": ["U Jp", "Atsushi Sakai", "Hiroko Yoshimura"], "doi": null, "journal": null}
+{"title": null, "sha": "4fa3a9c254c962d46f87acd7638f5ce3496451f7", "authors": ["Johannes Brumm", "Felix Kubler", "Michael Grill", "Karl Schmedders", "Deutsche Bundesbank"], "doi": null, "journal": null}
+{"title": "Layer-by-layer self-assembly strategy for template synthesis of nanoscale devices", "sha": "4fefc1aea0f872fd50e7908d5e1611b9e04bdb6f", "authors": ["N Kovtyukhova", "B Martin", "J Mbindyo", "T Mallouk", "M Cabassi", "T Mayer"], "doi": null, "journal": null}
+{"title": "Transductive Malware Label Propagation: Find Your Lineage From Your Neighbors", "sha": "5021090a00667faffd8d2816177d59fe7650c013", "authors": ["Deguang Kong", "Guanhua Yan"], "doi": null, "journal": null}
+{"title": "Is there any evidence to advocate SUI prevention in continent women undergoing prolapse repair? An overview", "sha": "5052680d334decd0e0dce66fe18b74297495a50a", "authors": ["B Fatton"], "doi": "10.1007/s00192-008-0734-4", "journal": "International Urogynecology Journal"}
+{"title": "Compact Routing Schemes for Generalised Chordal Graphs", "sha": "50bcd025aa92bf20642fdb4e969ce2ed15a63898", "authors": ["Yon Dourisboure"], "doi": null, "journal": "Journal of Graph Algorithms and Applications"}
+{"title": "Distributional impacts of the Self-Sufficiency Project", "sha": "50d82d1a300d1a8a86fdbf5aa5069b1be750e1fd", "authors": ["Marianne Bitler", "Jonah Gelbach", "Hilary Hoynes"], "doi": "10.1016/j.jpubeco.2007.07.001", "journal": "Journal of Public Economics"}
+{"title": "Visual Formal Specification using (N)TLCharts: Statechart Automata with Temporal Logic and Natural Language Conditioned Transitions", "sha": "513ed364e8b2e27b6ede8acc6aaa6e685e04cc6c", "authors": ["Doron Drusinsky"], "doi": null, "journal": null}
+{"title": "A new model for the study of rain-wind-induced vibrations of a simple oscillator", "sha": "515513a8d7e9b74306a66ce79b1b2f8f4f777caa", "authors": ["A Van Der Burgh", "Hartono", "A Abramian"], "doi": "10.1016/j.ijnonlinmec.2005.04.005", "journal": "International Journal of Non-Linear Mechanics"}
+{"title": "Assessment of turbulence model performance: Severe acceleration with large integral length scales", "sha": "5180a903533f026d72147743ac3915af6028dd46", "authors": ["Xiaoyu Yang", "Paul Tucker"], "doi": "10.1016/j.compfluid.2015.12.007", "journal": "Computers & Fluids"}
+{"title": "Detection of Colon Cancer Metastases With Fluorescence Laparoscopy in Orthotopic Nude Mouse Models", "sha": "521db80c5f649d11d700b02774a2bdffde99d92c", "authors": ["Rhiana Menen", "Sharmeela Kaushal", "Cynthia Snyder", "Mark Talamini", "Robert Hoffman", "Michael Bouvet"], "doi": "10.1001/archsurg.2012.704", "journal": "Archives of Surgery"}
+{"title": "Novel docetaxel-loaded nanoparticles based on PCL-Tween 80 copolymer for cancer treatment", "sha": "52a33583d87c2dc19f47ed822012dad6ccc7c62e", "authors": ["Lin Mei", "Zeng", "Cheng", "Zheng", "Song", "Huang", "Ma"], "doi": "10.2147/ijn.s25251", "journal": "International Journal of Nanomedicine"}
+{"title": "1751. 3) D.A.L. Paul", "sha": "52af2ac5d98e47e8707db0a456aaa259ceee1d66", "authors": ["S Tao", "J Bell", "J Green ; W", "G Falk", "E Jones ; P", "R Osmon ; W", "P Falk", "G Orth", "Jones"], "doi": null, "journal": "P. E. Osmon (unpublished)"}
+{"title": "Robust System Multiangulation Using Subspace Methods", "sha": "534c4d9a189c1c35ff412a1f4d36ef19e08ee2c8", "authors": ["Joshua Ash", "Lee Potter"], "doi": null, "journal": null}
+{"title": "The Rough Set Exploration System", "sha": "535fafcf004011cb6d88f3ff7714c5b5a81c4319", "authors": ["Jan Bazan", "Marcin Szczuka"], "doi": null, "journal": null}
+{"title": "Possible Microwave Absorption by H2S Gas in Uranus' and Neptune's Atmospheres", "sha": "53ae41789a3664b57548561f8bcacf3a50f6cd95", "authors": ["Imke De Pater", "Paul Romani", "Sushil Atreya"], "doi": null, "journal": "ICARUS"}
+{"title": "Integral Sliding-Based Robust Control", "sha": "53b8aa13cc1854ead919b8019148d8c79443ed52", "authors": ["Chieh-Chuan Feng"], "doi": null, "journal": null}
+{"title": "Nonlinear Control of PVTOL Vehicles subjected to Drag and Lift", "sha": "53d8cff786a0b02a6ff86c11cd5512fbf538eacd", "authors": ["Daniele Pucci", "Tarek Hamel", "Pascal Morin", "Claude Samson"], "doi": null, "journal": null}
+{"title": "lobChIP: from cells to sequencing ready ChIP libraries in a single day", "sha": "54072a34e47141cc8e3efddfa0777165eb6ca155", "authors": ["Ola Wallerman", "Helena Nord", "Madhusudhan Bysani", "Lisa Borghini", "Claes Wadelius"], "doi": "10.1186/s13072-015-0017-5", "journal": "Epigenetics & Chromatin"}
+{"title": "INFORMS-New Orleans 2005 Branch and Tree Decomposition Techniques for Discrete Optimization", "sha": "5424fdf309d72ee1824291243bf4115db6ea5b13", "authors": ["Illya Hicks", "Arie Koster", "Elif Koloto\u02d8"], "doi": "10.1287/educ.1053.0000", "journal": null}
+{"title": "Health Risks from Lead-Based Ammunition in the Environment", "sha": "549effe64f6babe6155455d08b8c1d56e96c70ef", "authors": ["David Bellinger", "Joanna Burger", "Tom Cade", "Deborah Cory-Slechta", "Myra Finkelstein", "Howard Hu", "Michael Kosnett", "Philip Landrigan", "Bruce Lanphear", "Mark Pokras", "Patrick Redig", "Bruce Rideout", "Ellen Silbergeld", "Robert Wright", "Donald Smith"], "doi": "10.1289/ehp.1306945", "journal": "Environmental Health Perspectives"}
+{"title": "The effects of serum lipids on the in vitro activity of lumefantrine and atovaquone against Plasmodium falciparum", "sha": "5516d3787bb7276419485eafaeacb1f90e7cb92f", "authors": ["Kesinee Chotivanich", "Mathirut Mungthin", "Ronnatrai Ruengweerayuth", "Rachanee Udomsangpetch", "Arjen Dondorp", "Pratap Singhasivanon", "Sasithon Pukrittayakamee", "Nicholas White"], "doi": null, "journal": null}
+{"title": "The Generic Consensus Service", "sha": "552369f8cbf567091c21529663e8f9b51c2951ce", "authors": ["Rachid Guerraoui", "Andre Schiper"], "doi": null, "journal": null}
+{"title": "Alternative tissue engineering scaffolds based on starch: processing methodologies, morphology, degradation and mechanical properties", "sha": "556624c5875da583d687fc682b732dd35fb998e7", "authors": ["M Gomes", "J Godinho", "D Tchalamov", "A Cunha", "R Reis"], "doi": null, "journal": null}
+{"title": "Optimum Linear Transmitter Design for MIMO Systems with Two QPSK Data Streams", "sha": "5593fe4e2eba0542372a1a998f59aa59c85a8dc5", "authors": ["Miquel Payar\u00f3", "Antonio Pascual-Iserte", "Migue\u00ed Lagunas"], "doi": null, "journal": null}
+{"title": "Large-Eddy Simulations of a Drizzling, Stratocumulus-Topped Marine Boundary Layer", "sha": "5599b16f786f7966233f70dd603957dba09c326b", "authors": ["Andrew Ackerman", "Margreet Vanzanten", "Bjorn Stevens", "Verica Savic-Jovcic", "Christopher Bretherton", "Andreas Chlond", "Jean-Christophe Golaz", "Hongli Jiang", "Marat Khairoutdinov", "Steven Krueger", "David Lewellen", "Adrian Lock", "Chin-Hoh Moeng", "Kozo Nakamura", "Markus Petters", "Jefferson Snider", "Sonja Weinbrecht", "Mike Zulauf"], "doi": "10.1175/2008mwr2582.1", "journal": "Monthly Weather Review"}
+{"title": "HARDWARE AND ALGORITHMS FOR ULTRASONIC DEPTH IMAGING", "sha": "55f4b11b7c07b6f44c2ce30e7b20e0ee450e3d45", "authors": ["Ivan Dokmani\u00b4c", "Dokmani\u00b4c", "Ivan Tashev"], "doi": null, "journal": null}
+{"title": "Interactions of ciprofloxacin with DPPC and DPPG: Fluorescence anisotropy, ATR-FTIR and 31P NMR spectroscopies and conformational analysis", "sha": "55f8fe646be2bf604b4f4dbf8a7f436c00a9c651", "authors": ["Hayet Bensikaddour", "Karim Snoussi", "Laurence Lins", "Fran\u00e7oise Van Bambeke", "Paul Tulkens", "Robert Brasseur", "Erik Goormaghtigh", "Marie-Paule Mingeot-Leclercq"], "doi": "10.1016/j.bbamem.2008.08.015", "journal": "Biochimica et Biophysica Acta (BBA) - Biomembranes"}
+{"title": "Publication P1 Designing Multiplicative General Parameter Filters Using Adaptive Genetic Algorithms", "sha": "56ad4991aad40daa5c6da61d9788337b6c844ba9", "authors": ["J Martikainen", "S Ovaska", "Jarno Martikainen", "Seppo Ovaska"], "doi": null, "journal": "Proc. of the Genetic and Evolutionary Computation Conference"}
+{"title": "The Relationship between Brain Oscillations and BOLD Signal during Memory Formation: A Combined EEG-fMRI Study", "sha": "5719ade214d6548302cd50900195943d22dd50fa", "authors": ["S Hanslmayr", "G Volberg", "M Wimber", "M Raabe", "M Greenlee", "K Bauml"], "doi": "10.1523/jneurosci.3140-11.2011", "journal": "Journal of Neuroscience"}
+{"title": null, "sha": "57791e4ca32ed25b0396eb56a5650536f62fe68e", "authors": [], "doi": null, "journal": null}
+{"title": "Inhibitors of amino acids biosynthesis as antifungal agents", "sha": "577c2feee29191dfc31f143617b4d52e6343f076", "authors": ["Kamila Jastrz\u0119bowska", "Iwona Gabriel"], "doi": "10.1007/s00726-014-1873-1", "journal": "Amino Acids"}
+{"title": "Hessian eigenmaps: Locally linear embedding techniques for high-dimensional data", "sha": "57a66ac4a4e0a00d2cdee8711ce0a18b49e9f7a2", "authors": ["David Donoho", "Carrie Grimes"], "doi": null, "journal": null}
+{"title": "Wittgenstein on the Substance of the World", "sha": "57d0b602653ba779e270abb900a1de312736a80c", "authors": ["Ian Proops"], "doi": null, "journal": null}
+{"title": "A Service of zbw", "sha": "589ee83f559e2e497b14df7cbc5d4f2bf99dc01e", "authors": [], "doi": null, "journal": null}
+{"title": ". ABSTRACT", "sha": "58a818d85d618946b08f2a11733dfa58985c41b2", "authors": [], "doi": null, "journal": null}
+{"title": "Self-assembled nanoparticles based on linoleic-acid modified chitosan: Stability and adsorption of trypsin", "sha": "58eec93b38646d81d9bc1ee93380fde3137d16ca", "authors": ["C Liu", "X Chen", "H Park"], "doi": "10.1016/j.carbpol.2005.08.010", "journal": "Carbohydrate Polymers"}
+{"title": "CXCR3 expression is associated with poor survival in breast cancer and promotes metastasis in a murine model", "sha": "594bf256ba94c22235c739a758b4cee98df2050d", "authors": ["X Ma", "K Norsworthy", "N Kundu", "W Rodgers", "P Gimotty", "O Goloubeva", "M Lipsky", "Y Li", "D Holt", "A Fulton"], "doi": "10.1158/1535-7163.mct-08-0485", "journal": "Molecular Cancer Therapeutics"}
+{"title": "Resource Allocation with Non-deterministic Demands and Profits", "sha": "59a64bc4e20ea824471f152c509b1851525d8d59", "authors": ["Nan Hu", "Diego Pizzocaro", "Matthew Johnson", "Thomas Laporta", "Alun Preece"], "doi": "10.1109/mass.2013.61", "journal": "2013 IEEE 10th International Conference on Mobile Ad-Hoc and Sensor Systems"}
+{"title": "External World Skepticism", "sha": "59e77ae369baa052f602fb5ab9f23118f85142b4", "authors": ["John Greco"], "doi": "10.1111/j.1747-9991.2007.00090.x", "journal": "Philosophy Compass"}
+{"title": "Factors Affecting Circulating mRNA for Nephrin", "sha": "59e82bea1a8c5b58cd7521e22c7d91b4fc5ada78", "authors": ["Uela Orlandi", "Asif Butt", "David Goldsmith", "R Swami-Nathan"], "doi": "10.1373/clinchem.2005.053124", "journal": "Clinical Chemistry"}
+{"title": "Impacts of Alternative Timber Harvest Practices on Leaf-Chewing Herbivores of Oak", "sha": "59f9d6226b0762d27ef82c3767622b1febf3b910", "authors": ["Rebecca Forkner", "Robert Marquis", "John Lill", "Josiane Le Corff"], "doi": "10.1111/j.1523-1739.2006.00346.x", "journal": "Conservation Biology"}
+{"title": "The Boltzmann Equation for Driven Systems of Inelastic Soft Spheres", "sha": "5a0dc993f52025b61ee6ac86a94b9d523a83a94c", "authors": ["M Ernst", "E Trizac", "A Barrat"], "doi": "10.1007/s10955-006-9062-6", "journal": "Journal of Statistical Physics"}
+{"title": "DC Proposal: Model for News Filtering with Named Entities", "sha": "5b05ed96f8e433fa55d3d756145312358a9e6a67", "authors": ["Ivo La\u0161ek"], "doi": null, "journal": null}
+{"title": "Airborne hydrocarbon contamination from laboratory atmospheres", "sha": "5b27b0b77b47370cbfd776c063ba24eb027ccbb2", "authors": ["Christian Illing", "Christian Hallmann", "Kristen Miller", "Roger Summons", "Harald Strauss"], "doi": "10.1016/j.orggeochem.2014.07.006", "journal": "Organic Geochemistry"}
+{"title": "Performance and Processing of SAR Satellite Clusters", "sha": "5b4986a6b8cb315bee08d4c2a206e2ef18e14be5", "authors": ["Jim Stiles", "Nathan Goodman", "Sichung Lin"], "doi": null, "journal": null}
+{"title": "In vitro activity of tigecycline and comparators on Acinetobacter spp. isolates collected from patients with bacteremia and MIC change during the Tigecycline Evaluation and Surveillance Trial, 2004 to 2008", "sha": "5b94e4d237d2f4f5fd608cf0fa1534baaf7b7003", "authors": ["Yun Wang", "Michael Dowzicky"], "doi": "10.1016/j.diagmicrobio.2010.04.002", "journal": "Diagnostic Microbiology and Infectious Disease"}
+{"title": "284 | Research and Practice | Peer Reviewed | Cutrona et al. Results", "sha": "5b983fda31c588e62f9ae02546f882b047dc850d", "authors": ["Sarah Cutrona", "Steffie Woolhandler", "Karen Lasser", "David Bor", "Danny Mccormick", "David Himmelstein"], "doi": "10.2105/AJPH.2007.114249)", "journal": "American Journal of Public Health"}
+{"title": "Tracking of signal and its derivatives in Gaussian white noise", "sha": "5c4018f4eed9d4662af344dd95847e2e8d52b91f", "authors": ["P.-L Chow", "R Khasminskii", "R Liptser"], "doi": null, "journal": null}
+{"title": "Art-based Rendering with Continuous Levels of Detail", "sha": "5c8e4b7272d8fa191fbf753aeef3d5ea90938193", "authors": ["Lee Markosian", "Barbara Meier", "Michael Kowalski", "Loring Holden", "J Northrup", "John Hughes"], "doi": null, "journal": null}
+{"title": "A case of epithelioid hemangioendothelioma in the liver", "sha": "5c9b697566f3ef07cb11e62854f29df91b3eeb63", "authors": ["Chunrong Ye", "Wen Wang"], "doi": "10.1007/s11805-007-0147-z", "journal": "Chinese Journal of Clinical Oncology"}
+{"title": "Competition among Telecommunication Providers", "sha": "5cc25532b6df7989f3c1709edc8fcaff8aa8da59", "authors": ["Patrick Maill\u00e9", "Peter Reichl", "Bruno Tuffin"], "doi": null, "journal": "LNCS"}
+{"title": "Run-time adaptability of synchronization policies in concurrent object-oriented languages", "sha": "5d8550cc6479d6f5cf1ad37e25cc6e613f957d74", "authors": ["Fernando S\u00e1nchez", "Juan Hern\u00e1ndez", "Juan Murillo", "Enrique Pedraza"], "doi": null, "journal": null}
+{"title": "Web Scripts and Mediation Dialogues as a Quality Factor in the Interaction of the Deaf", "sha": "5d9d5292e9ead0df3f53f80feb644382e7cb016e", "authors": ["Aline Alves", "Simone Ferreira", "Viviane Veiga", "Ingrid Monteiro", "Denis Silveira", "Alberto Raposo"], "doi": "10.1016/j.procs.2014.02.019", "journal": "Procedia Computer Science"}
+{"title": "Cooperativity between calmodulin-binding sites in Kv7.2 channels", "sha": "5e9d2fe5c249597e962c4ed5403e7083ca5d3658", "authors": ["Alessandro Alaimo", "Araitz Alberdi", "Carolina Gomis-Perez", "Juncal Fern\u00e1ndez-Orth", "Juan G\u00f3mez-Posada", "Pilar Areso", "Alvaro Villarroel"], "doi": "10.1242/jcs.114082", "journal": "Journal of Cell Science"}
+{"title": "Knowledge Management and Organizational Learning", "sha": "5f20e07e05dae05bceaeac7b6d21d1d6bb725a8b", "authors": ["William King"], "doi": "10.1007/978-1-4419-0011-1_1", "journal": "Knowledge Management and Organizational Learning"}
+{"title": "In Vitro Monitoring of Surface Mechanical Properties of Poly(L-Lactic Acid) Using Microhardness", "sha": "5f8df4a3ea809b3a0ba740a6f1af43b34cf1a221", "authors": ["C Saiz-Arroyo", "Y Wang", "M Rodriguez-Perez", "N Alves", "J Mano"], "doi": null, "journal": null}
+{"title": "Loop C and the mechanism of acetylcholine receptor\u2013channel gating", "sha": "5fd996c58de49d877f77f378b0415ffc2af5bc9b", "authors": ["Prasad Purohit", "Anthony Auerbach"], "doi": "10.1085/jgp.201210946", "journal": "The Journal of General Physiology"}
+{"title": null, "sha": "60341a791652d9bd9c2348416b472ebfb8e85e48", "authors": ["N\u00eddia Raquel Costa", "Marcelo Andreotti", "M\u00e1ila Gioia", "Maria Aparecida", "Anselmo Tarsitano", "Cristiano Magalh\u00e3es Pariz", "Salati\u00e9r Buzetti"], "doi": null, "journal": null}
+{"title": "Intrinsic \u03b1-helical and \u03b2-sheet conformational preferences: A computational case study of alanine", "sha": "60568326f4c94acf58d79de0014bbe3334c83f63", "authors": ["Diego Caballero", "Jukka M\u00e4\u00e4tt\u00e4", "Alice Zhou", "Maria Sammalkorpi", "Corey O&apos;hern", "Lynne Regan"], "doi": "10.1002/pro.2481", "journal": "Protein Science"}
+{"title": "Adrenocorticotrophic hormone and dexamethasone failed to affect milk yield in dairy goats: comparative aspects", "sha": "60af0ad22c4af5f067529bddb2aaf567977eb49d", "authors": ["A Shamay", "S Mabjeesh", "F Shapiro", "N Silanikove"], "doi": null, "journal": null}
+{"title": "Supervised Learning of Hidden and Non-Hidden 0-order Affordances and Detection in Real Scenes", "sha": "60d304023269254fe2c96a8bac15ea47777bb679", "authors": ["Aitor Aldoma", "Federico Tombari", "Markus Vincze"], "doi": null, "journal": null}
+{"title": "Contract Representation for Run-time Monitoring and Enforcement", "sha": "610bc32e1f27ff489100d9f59e8a98759ec122b7", "authors": ["Carlos Molina-Jimenez", "Santosh Shrivastava", "Ellis Solaiman", "John Warne", "{carlos Molina", "Santosh Shrivastava", "Ellis Solaiman", "J Warne}@ncl", "Uk"], "doi": null, "journal": null}
+{"title": null, "sha": "611bb08536e765b384fd4d08f41cdddb45f3512a", "authors": [], "doi": null, "journal": null}
+{"title": "Opportunities and constraints for biochar technology in Australian agriculture: looking beyond carbon sequestration", "sha": "617ee9b0401e678f5670ede63e4f22fa08a604b5", "authors": ["Balwant Singh", "Lynne Macdonald", "Rai Kookana", "Lukas Van Zwieten", "Greg Butler", "Stephen Joseph", "Anthony Weatherley", "Bhawana Kaudal", "Andrew Regan", "Julie Cattle", "Feike Dijkstra", "Mark Boersma", "Stephen Kimber", "Alexander Keith", "Maryam Esfandbod"], "doi": "10.1071/sr14112", "journal": "Soil Research"}
+{"title": null, "sha": "61a85f3f2135b13ffd87faaf15bbac7cd9c77ac8", "authors": [], "doi": null, "journal": null}
+{"title": "Effects of Habitual Alcohol Intake on Ambulatory Blood Pressure, Heart Rate, and Its Variability Among Japanese Men", "sha": "61c5b1cd86740b82959b395328d4579466d7a9f6", "authors": ["T Ohira", "T Tanigawa", "M Tabata", "H Imano", "A Kitamura", "M Kiyama", "S Sato", "T Okamura", "R Cui", "K Koike", "T Shimamoto", "H Iso"], "doi": "10.1161/hypertensionaha.108.114835", "journal": "Hypertension"}
+{"title": "Working with the Media Resurrecting a Small Library Tips and Techniques: Promoting an Academic Re earch Libmry Creating a Library Fair", "sha": "61eabc30c559f9a693ba0a1233b4b4a260dcc1bc", "authors": ["Michele Russo", "N", "Stanley Campbell", "Etic Bartheld", "Associate Dit\u00b7ectot\u00b7", "Karen Evans"], "doi": null, "journal": null}
+{"title": null, "sha": "61ed495617feaa7f639853428d18e2749ff20d7d", "authors": [], "doi": null, "journal": null}
+{"title": "Ontology Design Pattern Property Specialisation Strategies", "sha": "620f9fbb15b8292170f5983c3453ea0f28ee0baf", "authors": ["Karl Hammar"], "doi": null, "journal": null}
+{"title": "GENDER IDEOLOGY, SAME-SEX PEER GROUP AFFILIATION AND THE RELATIONSHIP BETWEEN TESTOSTERONE AND DOMINANCE IN ADOLESCENT BOYS AND GIRLS", "sha": "6211db99bf25d5b0ce9c698e4bda8ea27cfb487c", "authors": ["Hans Vermeersch", "Guy T&apos;sjoen", "J Kaufman", "J Vincke", "Mieke Van Houtte"], "doi": "10.1017/s0021932010000106", "journal": "Journal of Biosocial Science"}
+{"title": "Motion Control of Electric Vehicles Based on Robust Lateral Tire Force Control Using Lateral Tire Force Sensors", "sha": "622215ad689221a9ad585969d3e33a14ccfe4caa", "authors": ["Nam Kanghyun", "Hiroshi Fujimoto", "Yoichi Hori"], "doi": null, "journal": null}
+{"title": "Activin Upregulation by NF- B Is Required to Maintain Mesenchymal Features of Cancer Stem-like Cells in Non-Small Cell Lung Cancer", "sha": "6292cf33eb729fd662a5f9d15094fcb95817dd3c", "authors": ["J Wamsley", "M Kumar", "D Allison", "S Clift", "C Holzknecht", "S Szymura", "S Hoang", "X Xu", "C Moskaluk", "D Jones", "S Bekiranov", "M Mayo"], "doi": "10.1158/0008-5472.can-13-2702", "journal": "Cancer Research"}
+{"title": "Are Patients With Schizophrenia Insensitive to Pain? A Reconsideration of the Question", "sha": "62e6b35c8bda9bdf87778536a077c658d3173c57", "authors": ["Olivier Bonnot", "George Anderson", "David Cohen", "Jean Willer", "Sylvie Tordjman"], "doi": null, "journal": null}
+{"title": "Attenuated Responses to Angiotensin II in Follitropin Receptor Knockout Mice, a Model of Menopause-Associated Hypertension", "sha": "6322ad9cbca7d007cddbfb5ef00f55c2407510db", "authors": ["D Javeshghani", "R Touyz", "M Sairam", "A Virdis", "M Neves", "E Schiffrin"], "doi": "10.1161/01.hyp.0000085331.22169.3f", "journal": "Hypertension"}
+{"title": "Maternal expectations and birth-related experiences: a survey of pregnant women of mixed parity from Calcutta, India", "sha": "6337954a902904fa37364d9061a04e5a95aed0ca", "authors": ["I Hug", "C Chattopadhyay", "G Mitra", "R Kar Mahapatra", "M Schneider"], "doi": "10.1016/j.ijoa.2007.10.004", "journal": "International Journal of Obstetric Anesthesia"}
+{"title": "Self-Amplifying Replicon RNA Vaccine Delivery to Dendritic Cells by Synthetic Nanoparticles", "sha": "63ffbbc9d28a38e19b28668611c8e75ed4b644d2", "authors": ["Kenneth Mccullough", "Panagiota Milona", "Lisa Thomann-Harwood", "Thomas D\u00e9moulins", "Pavlos Englezou", "Rolf Suter", "Nicolas Ruggli"], "doi": "10.3390/vaccines2040735", "journal": "Vaccines"}
+{"title": "Gabor Feature Space Diffusion via the Minimal Weighted Area Method", "sha": "64440112f9ca75867542f1152b465d993e45c5c6", "authors": ["Chen Sagiv", "Nir Sochen", "Yehoshua Zeevi"], "doi": null, "journal": null}
+{"title": "Toward understanding respiratory sinus arrhythmia: Relations to cardiac vagal tone, evolution and biobehavioral functions", "sha": "6468a1c4c35e36e4cbd4855bfc37e053c52f2e83", "authors": ["Paul Grossman", "Edwin Taylor"], "doi": "10.1016/j.biopsycho.2005.11.014", "journal": "Biological Psychology"}
+{"title": "Processing Constrained k-Closest Pairs Queries in Crime Databases", "sha": "64eb08c61ad0ddaaf756f21bcfebb3c988b7f22b", "authors": ["Shaojie Qiao", "Changjie Tang", "Huidong Jin", "Shucheng Dai", "Xingshu Chen", "Michael Chau", "Jian Hu"], "doi": "10.1007/978-1-4419-1325-8_4", "journal": "Annals of Information Systems"}
+{"title": "POVERTY AND LABOUR MARKET RESPONSE TO REFORMS IN UGANDA BY", "sha": "655926e4479140734e9e03b4a57b73cb4e2ad852", "authors": ["Francis Okurut", "Sarah Ssewanyana"], "doi": null, "journal": null}
+{"title": null, "sha": "65cf977a65e4ce01c3794bc6109e2a9c4f322a84", "authors": [], "doi": null, "journal": null}
+{"title": "Divergence of protein-coding capacity and regulation in the Bacillus cereus sensu lato group", "sha": "65efb268ab86d5786a9667fac7f4cdb11e6956d5", "authors": ["Inimary Toby", "Jonah Widmer", "David Dyer"], "doi": null, "journal": null}
+{"title": "EDITORIAL PROGRAMA DE GEOLOGIA E GEOF\u00b4ISICAGEOF\u00b4GEOF\u00b4ISICA MARINHA DO BRASIL", "sha": "67f3776f100a70bd3c39474053a6999f9f3df7ae", "authors": ["Sidney Mello"], "doi": null, "journal": "Revista Brasileira de Geof\u00edsica"}
+{"title": "The Future of Exogenous Surfactant Therapy", "sha": "68070afa193b9e0715a47ad538e4feb03aeee560", "authors": ["D Willson", "R Notter"], "doi": "10.4187/respcare.01306", "journal": "Respiratory Care"}
+{"title": "Identification and characterization of laccase-type multicopper oxidases involved in dye-decolorization by the fungus Leptosphaerulina sp.", "sha": "68ac5a3126ffa0815185431d54a67c28e7a73a6f", "authors": ["Ledys Copete", "Xiomara Chanag\u00e1", "Jorge Barriuso", "Mar\u00eda L\u00f3pez-Lucendo", "Mar\u00eda Mart\u00ednez", "Susana Camarero"], "doi": "10.1186/s12896-015-0192-2", "journal": "BMC Biotechnology"}
+{"title": "Longitudinal viscosity of two-dimensional Yukawa liquids", "sha": "68c42613e9c1f3a78481a1d97ebe191d2eecc93f", "authors": ["Yan Feng", "J Goree", "Bin Liu"], "doi": "10.1103/physreve.87.013106", "journal": "Physical Review E"}
+{"title": "Decision-making during gambling: an integration of cognitive and psychobiological approaches", "sha": "69bc1412ac0c502e9fb81c7439aefd48345b6e21", "authors": ["Luke Clark"], "doi": "10.1098/rstb.2009.0147", "journal": "Philosophical Transactions of the Royal Society B: Biological Sciences"}
+{"title": "Channel Equalization in Filter Bank Based Multicarrier Modulation for Wireless Communications", "sha": "69d6a74679be6259b2c0597b23a198d6d609ffa3", "authors": ["Tero Ihalainen", "Tobias Hidalgo Stitz", "Mika Rinne", "Markku Renfors"], "doi": "10.1155/2007/49389", "journal": "EURASIP Journal on Advances in Signal Processing"}
+{"title": "Single-shot spectral interferometry with chirped pulses", "sha": "6a3e7631623942c5ec0b967099d500164746f9fd", "authors": ["J.-P Geindre", "P Audebert", "S Rebibo", "J.-C Gauthier"], "doi": null, "journal": "OPTICS LETTERS"}
+{"title": null, "sha": "6b0f3aaf47f9d812d8e914f7916501d0fa975087", "authors": ["Katy Bomer", "Valentine Muyumba", "Christopher Mehrens", "Cunningham Libraty"], "doi": null, "journal": null}
+{"title": "Semiclassical evaluation of kinetic isotope effects in 13-atomic system Semiclassical evaluation of kinetic isotope effects in 13-atomic system", "sha": "6b1c71773bada90870332a23fd3b957339602535", "authors": ["M Kryvohuz", "R Marcus", "M Kryvohuz", "R Marcus"], "doi": null, "journal": "Additional information on J. Chem. Phys"}
+{"title": "Perception of Human Motion with Different Geometric Models", "sha": "6b2377dfa05150795e3e95099bcf66f772b1cc5e", "authors": ["Jessica Hodgins", "James O&apos;brien", "Jack Tumblin"], "doi": null, "journal": "IEEE TRANSACTIONS ON VISUALIZATION AND COMPUTER GRAPHICS"}
+{"title": null, "sha": "6ba0bce8b868f12f3cd31e9c3996dcff37348438", "authors": ["Costa Benetti", "A"], "doi": null, "journal": "Revista de Biolog\u00eda Tropical"}
+{"title": "SURFACE WATER WAVES DUE TO AN OSCILLATORY WAVEMAKER IN THE PRESENCE OF SURFACE TENSION", "sha": "6bc6721ef9b0a992add36ae549bce8f67e8001ec", "authors": ["B Mandal", "S Banerjea"], "doi": null, "journal": "Internat J. Math. & Math. Sci"}
+{"title": "Expectancies about Control over Health: Relationship to Desire for Control of Health Care", "sha": "6c078a94336c329f39493e83f27a74f8df873882", "authors": ["Kenneth Wallston", "Roberta Smith", "Joan King", "Patricia Forsberg", "Barbara Wallston", "Vivian Nagy"], "doi": null, "journal": "Personnlicy .and Sochl P~ycholory Bullcrin"}
+{"title": "Motor, not visual, encoding of potential reach targets", "sha": "6ca1b6676fb6de84436b00e9ad06d493d1d7fbb6", "authors": ["Brandie Stewart", "Jason Gallivan", "Lee Baugh", "J Flanagan"], "doi": "10.1016/j.cub.2014.08.046", "journal": "Current Biology"}
+{"title": "Graded cross-links for stronger nanomaterials", "sha": "6d17d8c0c99b0fac5d7a6719734f337d9e6b7c29", "authors": ["Nicola Pugno"], "doi": null, "journal": "NUMBER"}
+{"title": "On Redundant Multipath Operating System Support for Wireless Mesh Networks", "sha": "6d30758efb1dddbd0780e4dac60e0fa7cdd010bb", "authors": ["Yair Amir", "Claudiu Danilov", "Michael Kaplan", "Raluca Mus\u02d8 Aloiu-Elefteri", "Nilo Rivera"], "doi": null, "journal": null}
+{"title": "Anatomia e ultra-estrutura do pulvino prim\u00e1rio de Pterodon pubescens Benth. (Fabaceae-Faboideae)", "sha": "6d6d10587cc685f2f757c9fc55f43e7e36b9d5de", "authors": ["Revista Brasil", "V Bot"], "doi": null, "journal": null}
+{"title": "Advances in short bowel syndrome: an updated review", "sha": "6d89b088163d6d670a509927b5796af15d59f573", "authors": ["Igor Sukhotnik", "Arnold Coran", "Alexander Kramer", "Eitan Shiloni", "Jorge Mogilner"], "doi": "10.1007/s00383-005-1500-z", "journal": "Pediatric Surgery International"}
+{"title": "Neonatal FcR Expression in Bone Marrow-Derived Cells Functions to Protect Serum IgG from Catabolism", "sha": "6e2a7e0dee0c743b30cf43bbe7f2744fb6890be5", "authors": ["S Akilesh", "G Christianson", "D Roopenian", "A Shaw"], "doi": "10.4049/jimmunol.179.7.4580", "journal": "The Journal of Immunology"}
+{"title": "ICH SPRECHE ANDERS, ABER DAS IST AUCH DEUTSCH: 1 L\u00cdNGUAS EM CONFLITO EM UMA ESCOLA RURAL LOCALIZADA EM ZONA DE IMIGRA\u00c7\u00c3O NO SUL DO BRASIL ICH SPRECHE ANDERS, ABER DAS IST AUCH DEUTSCH: LANGUAGES IN CONFLICT IN A RURAL SCHOOL LOCATED IN AN IMMIGRATION AREA IN THE SOUTH OF BRAZIL", "sha": "6e39bd59f7852ea61d2325f56f52631d4233e5b5", "authors": ["Maristela Pereira Fritzen"], "doi": null, "journal": "Trab. Ling. Aplic"}
+{"title": "REGULATION IN HAPPYVILLE*", "sha": "6e4f269f091b41b23f5a421a367257ace7b74208", "authors": ["Fran\u00e7ois Salanie\u00e1nd", "Nicolas Treich"], "doi": null, "journal": null}
+{"title": "INDIANA LIBRARIES", "sha": "6ec801908ff30d575249d4c02f0ee36aec2ce380", "authors": ["I\\jpul Fl"], "doi": null, "journal": null}
+{"title": "Kinetics of Peptide Binding to the Class II MHC Protein I\u2212Ek\u2020", "sha": "6ee607f00b61606839c0618007b0144fb916160e", "authors": ["Peter Kasson", "Joshua Rabinowitz", "Lutz Schmitt", "Mark Davis", "Harden Mcconnell"], "doi": "10.1021/bi9921337", "journal": "Biochemistry"}
+{"title": "Prevalence and molecular heterogeneity of Bartonella bovis in cattle and Haemaphysalis bispinosa ticks in Peninsular Malaysia", "sha": "6f0ace0df1d70ef73d193ad4c450247d684389af", "authors": ["Kai-Ling Kho", "Fui-Xian Koh", "Tariq Jaafar", "Quaza Hassan Nizam", "Sun-Tee Tay"], "doi": "10.1186/s12917-015-0470-1", "journal": "BMC Veterinary Research"}
+{"title": "Constructing core competency indicators for clinical teachers in Taiwan: a qualitative analysis and an analytic hierarchy process", "sha": "6f0db1e4f750b204cab2722c0580c88a2940c6f4", "authors": ["Ai-Tzu Li", "Jou-Wei Lin"], "doi": null, "journal": null}
+{"title": "Novel second-language words and asymmetric lexical access", "sha": "6f433952ce2923c7eb84246b2c852a621584383f", "authors": ["Paola Escudero", "Rachel Hayes-Harb", "Holger Mitterer"], "doi": "10.1016/j.wocn.2007.11.002", "journal": "Journal of Phonetics"}
+{"title": "Scaling Laws in the Distribution of Galaxies", "sha": "6f93dac529da3ac314e0e494778593670a8342fd", "authors": ["Bernard Jones", "Vicent Mart\u00ednez", "Enn Saar", "Virginia Trimble"], "doi": null, "journal": null}
+{"title": "Comprehensive Assessment and Mathematical Modeling of T Cell Population Dynamics and Homeostasis 1", "sha": "6fb59f68b41cfa9c747f983dce4f2e195947fece", "authors": ["V\u00e9ronique Thomas-Vaslin", "\u2020 Hester", "Korthals Altes", "Rob De Boer", "\u00a7", "David Klatzmann"], "doi": null, "journal": "The Journal of Immunology"}
+{"title": "Radial and Angular Rotons in Trapped Dipolar Gases", "sha": "6fbf6f2fe595720ed6d76f1ca3f318bf5b7c9379", "authors": ["Shai Ronen", "Daniele Bortolotti", "John Bohn"], "doi": "10.1103/physrevlett.98.030406", "journal": "Physical Review Letters"}
+{"title": "Information Science at City University London", "sha": "6fdbf16f81206100806e457503d4a83558f7d075", "authors": ["David Bawden"], "doi": "10.1108/00012530710817537", "journal": "Aslib Proceedings"}
+{"title": "Status Asthmaticus Status Asthmaticus From the Emergency Department to the Intensive Care Unit", "sha": "6ff83beffaa58baf8e44b5d465b3f6ea14086192", "authors": ["Nicholas Kenyon", "Timothy Albertson"], "doi": null, "journal": "Clinical Reviews in Allergy and Immunology"}
+{"title": "The complex local mean decomposition", "sha": "701320db14d563f49d0764879594488a35303acc", "authors": ["Cheolsoo Park", "David Looney", "Marc Van Hulle", "Danilo Mandic"], "doi": "10.1016/j.neucom.2010.07.030", "journal": "Neurocomputing"}
+{"title": "Comparative analysis of the transcriptome across distant species", "sha": "70c1c270b9501f4c4c17eae8052d7caea821977f", "authors": ["Mark Gerstein", "Joel Rozowsky", "Koon-Kiu Yan", "Daifeng Wang", "Chao Cheng", "James Brown", "Carrie Davis", "Ladeana Hillier", "Cristina Sisu", "Jingyi Li", "Baikang Pei", "Arif Harmanci", "Michael Duff", "Sarah Djebali", "Roger Alexander", "Burak Alver", "Raymond Auerbach", "Kimberly Bell", "Peter Bickel", "Max Boeck", "Nathan Boley", "Benjamin Booth", "Lucy Cherbas", "Peter Cherbas", "Chao Di", "Alex Dobin", "Jorg Drenkow", "Brent Ewing", "Gang Fang", "Megan Fastuca", "Elise Feingold", "Adam Frankish", "Guanjun Gao", "Peter Good", "Roderic Guig\u00f3", "Ann Hammonds", "Jen Harrow", "Roger Hoskins", "C\u00e9dric Howald", "Long Hu", "Haiyan Huang", "Tim Hubbard", "Chau Huynh", "Sonali Jha", "Dionna Kasper", "Masaomi Kato", "Thomas Kaufman", "Robert Kitchen", "Erik Ladewig", "Julien Lagarde", "Eric Lai", "Jing Leng", "Zhi Lu", "Michael Maccoss", "Gemma May", "Rebecca Mcwhirter", "Gennifer Merrihew", "David Miller", "Ali Mortazavi", "Rabi Murad", "Brian Oliver", "Sara Olson", "Peter Park", "Michael Pazin", "Norbert Perrimon", "Dmitri Pervouchine", "Valerie Reinke", "Alexandre Reymond", "Garrett Robinson", "Anastasia Samsonova", "Gary Saunders", "Felix Schlesinger", "Anurag Sethi", "Frank Slack", "William Spencer", "Marcus Stoiber", "Pnina Strasbourger", "Andrea Tanzer", "Owen Thompson", "Kenneth Wan", "Guilin Wang", "Huaien Wang", "Kathie Watkins", "Jiayu Wen", "Kejia Wen", "Chenghai Xue", "Li Yang", "Kevin Yip", "Chris Zaleski", "Yan Zhang", "Henry Zheng", "Steven Brenner", "Brenton Graveley", "Susan Celniker", "Thomas Gingeras", "Robert Waterston"], "doi": "10.1038/nature13424", "journal": "Nature"}
+{"title": "DEVELOPMENT OF A TWO-STAGE BIOMASS COMBUSTION SYSTEM 55 DEVELOPMENT OF A TWO-STAGE BIOMASS COMBUSTION SYSTEM FOR REDUCING THE EMISSION POLLUTANT", "sha": "70d373cc1c4ef23906cce37ce290536d79353f69", "authors": ["Adi Surjosatyo", "&amp; Farid", "Nasir Ani"], "doi": null, "journal": "Jurnal Teknologi"}
+{"title": "Distribui\u00e7\u00e3o espacial da flora terrestre fanerog\u00e2mica do Parque Nacional", "sha": "70f6bb41ce86bee069440cd5bd0b62555bc0112b", "authors": ["Revista Brasil", "V Bot"], "doi": null, "journal": null}
+{"title": "P1 Cerebral autoregulation testing in a porcine model of intravenously administrated E. coli induced fulminant sepsis", "sha": "70fd708f7d6dd7c4631d4c8405ae2d2c8662b9dc", "authors": ["L Molnar", "M Berhes", "L Papp", "N Nemeth", "B Fulesdi"], "doi": null, "journal": "Critical Care"}
+{"title": "Microglia Disrupt Mesolimbic Reward Circuitry in Chronic Pain", "sha": "711538df4f98f131626c44bd6f3383a2376d9199", "authors": ["A Taylor", "A Castonguay", "A Taylor", "N Murphy", "A Ghogha", "C Cook", "L Xue", "M Olmstead", "Y De Koninck", "C Evans", "C Cahill"], "doi": "10.1523/jneurosci.4036-14.2015", "journal": "Journal of Neuroscience"}
+{"title": null, "sha": "712b2d5407f86f6e4c58f75c44fb628d213bab8d", "authors": ["X Survey O F Cancer Cases I N The Hospitals O F Bridgeport", "Conn"], "doi": null, "journal": null}
+{"title": "AN OPTIMAL ALGORITHM FOR CONSTRAINED DIFFERENTIABLE CONVEX OPTIMIZATION", "sha": "712ed5a20dfa8e63c47d0296d5766055e76026da", "authors": ["Cl\u00b4ovis Cl\u00b4", "Cl\u00b4ovis Gonzaga", "Elizabeth Karas", "Diane Rossetto"], "doi": null, "journal": null}
+{"title": "The minimal entropy measure and an Esscher transform in an incomplete market model", "sha": "713ad16e9c9c9e67dd9238ff6195b33c428e393e", "authors": ["Michael Monoyios"], "doi": null, "journal": null}
+{"title": "The Economics and Psychology of Inequality and Human Development", "sha": "71529e69d57922613b769342cf3b286b79674e10", "authors": ["Flavio Cunha", "James Heckman"], "doi": null, "journal": null}
+{"title": "On the Maximal Width of Empty Lattice Simplices", "sha": "715571c447d1461831065e301cdc257d359d73d4", "authors": ["Christian Haase", "G Unter", "M Ziegler"], "doi": null, "journal": "Europ. J. Combinatorics"}
+{"title": "Characteristics of gliomas in patients with somatic IDH mosaicism", "sha": "71d6c224b2abbf74ba03f439f946323ebaa3199b", "authors": ["Charlotte Bonnet", "Laure Thomas", "Dimitri Psimaras", "Franck Bielle", "Elodie Vaul\u00e9on", "Hugues Loiseau", "St\u00e9phanie Cartalat-Carel", "David Meyronet", "Caroline Dehais", "J\u00e9r\u00f4me Honnorat", "Marc Sanson", "Fran\u00e7ois Ducray"], "doi": "10.1186/s40478-016-0302-y", "journal": "Acta Neuropathologica Communications"}
+{"title": "SHAPED BEAM PATTERN SYNTHESIS WITH NON-UNIFORM SAMPLE PHASES", "sha": "71fe419830e4e04db40c025b93becf20334324a6", "authors": ["J Azevedo"], "doi": null, "journal": "Progress In Electromagnetics Research B"}
+{"title": "Diagnosing schistosomiasis: where are we?", "sha": "723e4fae1cf2518ac4d62db198d204174bec1075", "authors": ["Luciana Gomes", "Martin Enk", "Ana Rabello"], "doi": "10.1590/0037-8682-0231-2013", "journal": "Revista da Sociedade Brasileira de Medicina Tropical"}
+{"title": "Fetal surgery for lung lesions, congenital diaphragmatic hernia, and sacrococcygeal teratoma", "sha": "725dbb4224b73bccf9e3e7fd5ee2b28cc07e6906", "authors": ["N Adzick", "Yoshihiro Kitano"], "doi": "10.1016/s1055-8586(03)00030-1", "journal": "Seminars in Pediatric Surgery"}
+{"title": "Stability of Plasma Free Metanephrines during Collec-tion and Storage as Assessed by an Optimized HPLC Method with Electrochemical Detection", "sha": "729580d0ef1dd3831a8c513ce83d97a232ba2abe", "authors": ["Jacques Wil-Lemsen", "C", "Fred Sweep", "Jacques Lenders", "H Ross"], "doi": null, "journal": null}
+{"title": null, "sha": "72cdaa5df897570a795033a04e365f8e4d69eb74", "authors": ["Mesquita Filho Brasil", "Toledo Lima", "Renata Raposo", "Jorge Luiz", "; Virg\u00edlio", "Alex Anchieta", "Gomes Neto"], "doi": null, "journal": null}
+{"title": "Robust iterative fitting of multilinear models", "sha": "72d83fd37a0fadea9b3b6d5dc8475f7e3516ef09", "authors": ["S Vorobyov", "Yue Rong", "N Sidiropoulos", "A Gershman"], "doi": "10.1109/tsp.2005.850343", "journal": "IEEE Transactions on Signal Processing"}
+{"title": "Weierstra\u00df-Institut f\u00fcr Angewandte Analysis und Stochastik Survival and complete convergence for a spatial branching system with local regulation", "sha": "73065f6455d1d50d74d278397e0b5554469209b1", "authors": ["Im Forschungsverbund", "Berlin Preprint", "Matthias Birkner", "Andrej Depperschmidt"], "doi": null, "journal": null}
+{"title": "Regulation of STAT protein synthesis by c-Cbl", "sha": "73c1163d676473b130701c64785117d0251a77bd", "authors": ["Warren Blesofsky", "Kerri Mowen", "Robert Arduini", "Darren Baker", "Maria Murphy", "David Dl Bowtell", "Michael David"], "doi": null, "journal": null}
+{"title": "Human embryo twinning with applications in reproductive medicine", "sha": "73c7596a2b0bd851745ac54d4ed28cceefe31472", "authors": ["Karl Illmensee", "Mike Levanduski", "Andrea Vidali", "Nabil Husami", "Vasilios Goudas"], "doi": "10.1016/j.fertnstert.2008.12.098", "journal": "Fertility and Sterility"}
+{"title": "Integration of molecular and clinical data of 40 unrelated von Willebrand Disease families in a Spanish locus-specific mutation database: first release including 58 mutations", "sha": "742f25de65b4c04cc83023cc0aa1f0d40a489367", "authors": [], "doi": null, "journal": null}
+{"title": "Non-Regenerative Full Distributed Space-Time Codes in Cooperative Relaying Networks", "sha": "74cba9ac1bd67ce33459ee1c9bd9303db182adac", "authors": ["Le Quang", "Vinh Tran", "Olivier Berder", "Olivier Sentieys"], "doi": null, "journal": null}
+{"title": "Response changes of some wells in the mainland subsurface fluid monitoring network of China, due to the September 21, 1999, Ms7.6 Chi-Chi Earthquake", "sha": "7502714eda9f038f5c2b74e60598573cc081a28c", "authors": ["Fu-Qiong Huang", "Chun-Lin Jian", "Yi Tang", "Gui-Ming Xu", "Zhi-Hui Deng", "Gong-Cai Chi"], "doi": "10.1016/j.tecto.2004.03.022", "journal": "Tectonophysics"}
+{"title": "An exploratory statistical approach to depression pattern identification", "sha": "7523d057e767d56df3a620651746bdacd3620758", "authors": ["Qing Feng", "Frances Griffiths", "Nick Parsons", "Jane Gunn"], "doi": "10.1016/j.physa.2012.10.025", "journal": "Physica A: Statistical Mechanics and its Applications"}
+{"title": "Combining Oncolytic HSV-1 with Immunogenic Cell Death-Inducing Drug Mitoxantrone Breaks Cancer Immune Tolerance and Improves Therapeutic Efficacy", "sha": "753f4c2b0c450f7c474449c4a80a5a5845402db1", "authors": ["S Workenhe", "J Pol", "B Lichty", "D Cummings", "K Mossman"], "doi": "10.1158/2326-6066.cir-13-0059-t", "journal": "Cancer Immunology Research"}
+{"title": "Liquid-immersion laser micromachining of GaN grown on sapphire", "sha": "758a35b7eebf0404abd380cec1c9c6de8cc2bb4f", "authors": ["Giuseppe Mak", "Edmund Lam", "H Choi"], "doi": "10.1007/s00339-010-6169-z", "journal": "Applied Physics A"}
+{"title": "Authoritarianism and exposure to another's behavior in a risk-taking situation I", "sha": "7652b2c615ecd0da024c65b31fbb4c662308bd59", "authors": [], "doi": null, "journal": null}
+{"title": "Effect of COMT Val 108/158 Met genotype on frontal lobe function and risk for schizophrenia", "sha": "766dd93ccc4631424becd0eed6c3a472513e33ec", "authors": ["Michael Egan", "Terry Goldberg", "Bhaskar Kolachana", "Joseph Callicott", "Chiara Mazzanti", "Richard Straub", "David Goldman", "Daniel Weinberger"], "doi": null, "journal": null}
+{"title": "Surface Stoichiometry of Manganin Coatings Prepared by Pulsed Laser Deposition As Described by Laser-Induced Breakdown Spectrometry", "sha": "7675dc9b7d7874916b165045aa2271e9b695356d", "authors": ["L Cabal\u00edn", "J Laserna"], "doi": "10.1021/ac000715k", "journal": "Analytical Chemistry"}
+{"title": "Optimal energy management of an industrial consumer in liberalized markets", "sha": "76842dbdd830902efb365b4e8afb6b2fee8323b6", "authors": ["E Gomez-Villalva", "A Ramos"], "doi": "10.1109/tpwrs.2003.811197", "journal": "IEEE Transactions on Power Systems"}
+{"title": null, "sha": "7691e0a176891d6079176af20ecdf1c32ee97d81", "authors": [], "doi": null, "journal": null}
+{"title": "Stationary equilibria in discounted stochastic games with weakly interacting players", "sha": "76baee4b62be0737b9ca10d73dc8b80dbc115ad7", "authors": ["Ulrich Horst"], "doi": "10.1016/j.geb.2004.03.003", "journal": "Games and Economic Behavior"}
+{"title": "Personal Heart Monitoring and Rehabilitation System using Smart Phones", "sha": "7707c1b331f23145b821aa78b779aa31346e833c", "authors": ["Peter Leijdekkers", "Val\u00e9rie Gay"], "doi": null, "journal": null}
+{"title": "Starch as a major integrator in the regulation of plant growth", "sha": "772838f3fbdae0f0f73b2df85f4ab4c21ac98082", "authors": ["Ronan Sulpice", "Eva-Theresa Pyl", "Hirofumi Ishihara", "Sandra Trenkamp", "Matthias Steinfath", "Hanna Witucka-Wall", "Yves Gibon", "Bj\u00f6 Rn Usadel", "Fabien Poree", "Maria Concei\u00e7\u00e3 O Piques", "Maria Korff", "Marie Steinhauser", "Joost Keurentjes", "Manuela Guenther", "Melanie Hoehne", "Joachim Selbig", "Alisdair Fernie", "Thomas Altmann", "Mark Stitt"], "doi": null, "journal": null}
+{"title": "PATHOPHYSIOLOGY OF TYPE 2 DIABETES Continuing Medical Education", "sha": "777a70c62171b12671728219a386479700e7823c", "authors": ["A Scheen", "A Scheen"], "doi": null, "journal": "Acta Clinica Belgica"}
+{"title": "Identification and characterization of endophytic bacteria from corn (Zea mays L.) roots with biotechnological potential in agriculture", "sha": "780b31dc293e8e522dfae0be7c4b57e7bfb3cd3d", "authors": ["Vivian Jaskiw Szilagyi-Zecchin", "Angela Ikeda", "Mariangela Hungria", "Douglas Adamoski", "Vanessa Kava-Cordeiro", "Chirlei Glienke", "Lygia Vit\u00f3ria Galli-Terasawa"], "doi": null, "journal": null}
+{"title": "NBER WORKING PAPER SERIES FIDUCIARY DUTIES AND EQUITY-DEBTHOLDER CONFLICTS We would like to thank Lynn LoPucki for sharing data, Kathryn Chiu and Rimma Yusim for research assistance, and an anonymous referee", "sha": "7811cd858762ef9227dd81514191d85af970f1bf", "authors": ["Bo Becker", "Per Str\u00f6mberg", "Bill Allen", "Ken Ayotte", "Douglas Baird", "Carliss Baldwin", "Patrick Bolton", "Matthieu Bouvard", "Thomas Chemmanur", "John Coates", "Mihir Desai", "Alex Edmans", "Stu Gilson", "Todd Gormley", "Jeremy Graveline", "Rocco Huang", "Marcin Kacperczyk", "Mark Leary", "Michael Lemmon", "Ed Morrison", "Jeff Netter", "Raghuram Rajan", "Mark Roe", "David Scharfstein", "Albert Sheen", "Suraj Srinivasan", "Guhan Subramanian", "Jeremy Stein", "Michael Weisbach"], "doi": null, "journal": null}
+{"title": "Simulation of classical thermal states on a quantum computer: A transfer-matrix approach", "sha": "781355f713b126600cc9a1429aabf7b5b9f1d2db", "authors": ["Man-Hong Yung", "Daniel Nagaj", "James Whitfield", "Al\u00e1n Aspuru-Guzik"], "doi": "10.1103/physreva.82.060302", "journal": "Physical Review A"}
+{"title": "THE HALL INSTABILITY OF WEAKLY IONIZED, RADIALLY STRATIFIED, ROTATING DISKS", "sha": "78405cb92fef6e2fa7f0e3677024e01cb074262e", "authors": ["Edward Liverts", "Michael Mond", "Arthur Chernin"], "doi": null, "journal": null}
+{"title": "Using Predictive Analysis to Improve Invoice-to-Cash Collection", "sha": "790c79966f89615270ca4ab743f8602f110b16a9", "authors": ["Sai Zeng", "Ioana Boier-Martin", "Prem Melville", "Conrad Murphy", "Christian Lang"], "doi": null, "journal": "KDD'08"}
+{"title": "Recruitment of vimentin to the cell surface by 3 integrin and plectin mediates adhesion strength", "sha": "79458814708fddba9212b75bdbb6cf7e4e0780ac", "authors": ["R Bhattacharya", "A Gonzalez", "P Debiase", "H Trejo", "R Goldman", "F Flitney", "J Jones"], "doi": "10.1242/jcs.043042", "journal": "Journal of Cell Science"}
+{"title": "Fracture Behavior of Micro-Sized Fe-3%Si Alloy Single Crystals*", "sha": "7987b71855654ab32c7d53420111b0cbff8bd8e4", "authors": ["Eiji Taki", "Yuji Kawakami", "Masaaki Otsu", "Kazuki Takashima"], "doi": null, "journal": "Journal"}
+{"title": "Robust Prediction of Atrial Fibrillation Termination Using Wavelet Bidomain Entropy Analysis", "sha": "799ff2d72f638075b3535b9b54310b8899b67210", "authors": ["R Alcaraz", "J Rieta"], "doi": null, "journal": null}
+{"title": "Endobronchial metallic clips to guide high-dose external-beam radiotherapy in radio-occult lung cancer", "sha": "79cc73d4a6d0420ffebaa11149a97f53b3c6838b", "authors": ["T Malfait", "M Van Eijkeren", "J Van Meerbeeck", "K Tournoy"], "doi": "10.1183/09031936.00200209", "journal": "European Respiratory Journal"}
+{"title": "Vouchers in U.S. Vocational Training Programs: An Overview of What We Have Learned", "sha": "7a61a0280e257a9b0d5c501fb1283d4a93605598", "authors": ["Burt Barnow"], "doi": null, "journal": null}
+{"title": "ERK2 Contributes to the Control of Social Behaviors in Mice", "sha": "7a73195aa60c338ed051ab3623c82709ca38e9de", "authors": ["Y Satoh", "S Endo", "T Nakata", "Y Kobayashi", "K Yamada", "T Ikeda", "A Takeuchi", "T Hiramoto", "Y Watanabe", "T Kazama"], "doi": "10.1523/jneurosci.2349-11.2011", "journal": "Journal of Neuroscience"}
+{"title": "Genetic Factors in Type 2 Diabetes: The End of the Beginning?", "sha": "7b6954e63235748f3c58a64b777016e5d72baeb2", "authors": ["M Mccarthy", "P Froguel ; 2", ". Tuomilehto"], "doi": "10.1126/science.1104346", "journal": "Science"}
+{"title": "Every child counts: Universal primary education in the Middle East and North Africa", "sha": "7d195d121f3af99731fb8696cda83651a99cc89d", "authors": ["Cecilia Baldeh", "Mohamed Bile", "Farid Boubekeur", "Friedrich Huebler"], "doi": null, "journal": null}
+{"title": "Las enfermedades cr\u00f3nicas desde la mirada de los enfermos y los profesionales de la salud: Chronic illness from the perspective of patients and health professionals: a qualitative study in Mexico", "sha": "7d5464c45ecff58ff1b29ddb7d32783ea3983ce3", "authors": ["Un Estudio Cualitativo En M\u00e9xico", "Francisco Mercado-Mart\u00ednez", "Eduardo Hern\u00e1ndez-Ibarra"], "doi": null, "journal": null}
+{"title": "Indium-induced changes in GaN\"0001\u2026 surface morphology", "sha": "7dbc248ad3a1676d8b350d83257bd7aded0fb935", "authors": ["John Northrup", "J\u00f6 Rg Neugebauer"], "doi": null, "journal": null}
+{"title": "Heterogeneous Relay Selection", "sha": "7dd2e1694977fb0002610eb51c9660ca082e7e94", "authors": ["Mohamed Abouelseoud", "Aria Nosratinia"], "doi": "10.1109/twc.2013.013013.121689", "journal": "IEEE Transactions on Wireless Communications"}
+{"title": "Nonlinear Analysis of the Iterative Decoding of Parallel Concatenated Convolutional Codes", "sha": "7dd8e2ca8f4f537c9b3dd6288e8ea694b18d2098", "authors": ["F Lehmann", "G Maggio"], "doi": "10.1109/tit.2005.847751", "journal": "IEEE Transactions on Information Theory"}
+{"title": "A Calorimeter Coupled with a Magnetic Spectrometer for the Detection of Primary Cosmic Antiprotons", "sha": "7e66c70d2d2a38a3cdaf330848960e08abbee3f5", "authors": ["G Basini", "A Morselli", "M 0cchigrossi", "M Zicci", "P Spillantini", "Laboratori Nazionali Infn -Frascati", "Roma", "F Bongiorno", "P Picozza", "Laboratori Nazionali Infn -Frascati", "Roma", "A Codino", "M Menichelli", "S Bartalucci"], "doi": null, "journal": null}
+{"title": "Secure Context-sensitive Authorization", "sha": "7e9a4b04b7dc27d2deb819ea0982d6fb101ec83e", "authors": ["Kazuhiro Minami", "David Kotz"], "doi": null, "journal": null}
+{"title": "Approximate Minimum Bit Error Rate Equalization for Fading Channels", "sha": "7ef4097546b367c09ac22280987c969e5ee97f7e", "authors": ["Lorant Kovacs", "Janos Levendovszky", "Andras Olah", "Gergely Treplan"], "doi": "10.1155/2010/615623", "journal": "EURASIP Journal on Advances in Signal Processing"}
+{"title": "Suffix Tree Characterization of Maximal Motifs in Biological Sequences *", "sha": "7efc69adbe1431ffae9b0d60bce8152e8118cf3f", "authors": ["Maria Federico", "Nadia Pisanti"], "doi": null, "journal": null}
+{"title": "HEALTH POLICY ANALYSIS Implementing Pharmacoeconomic Guidelines in Latin America: Lessons Learned", "sha": "7f1129ded6d82d9098833efeda160cbde393ed4c", "authors": [], "doi": null, "journal": null}
+{"title": "The significance of cortical pathology in progressive supranuclear palsy Clinico-pathological data in 10 cases", "sha": "7f1625777c760cdffe67748cf56204cdd600b1bf", "authors": ["M Verny", "C Duyckaerts", "Y Agid", "J.-J Hauw", "/"], "doi": null, "journal": "Brain"}
+{"title": "Using Online Hotel Customer Reviews to Improve the Booking Process", "sha": "7f171be882e37850240a9759ce42172e2a51799a", "authors": ["Wojoud Al-Abdullatif", "Yasser Kotb"], "doi": null, "journal": "International Journal of Computer Applications"}
+{"title": "Understanding Sources of Dietary Phosphorus in the Treatment of Patients with Chronic Kidney Disease", "sha": "80626aacf94314dafccc8caab9b3659a035a521f", "authors": ["K Kalantar-Zadeh", "L Gutekunst", "R Mehrotra", "C Kovesdy", "R Bross", "C Shinaberger", "N Noori", "R Hirschberg", "D Benner", "A Nissenson", "J Kopple"], "doi": "10.2215/cjn.06080809", "journal": "Clinical Journal of the American Society of Nephrology"}
+{"title": "Induction of Pluripotent Stem Cells from Adult Human Fibroblasts by Defined Factors", "sha": "80a38c171da4ef9598ae1a0ffdf6f69511b8f715", "authors": ["Kazutoshi Takahashi", "Koji Tanabe", "Mari Ohnuki", "Megumi Narita", "Tomoko Ichisaka", "Kiichiro Tomoda", "Shinya Yamanaka"], "doi": "10.1016/j.cell.2007.11.019", "journal": "Cell"}
+{"title": "Estimated portion sizes in a school-aged population", "sha": "80af54fbb45386a64772e403958ac571aa4c2c0f", "authors": ["Sumaiya Patel", "Avni Vyas", "Adnan Custovic", "Clare Murray"], "doi": "10.1017/s1368980012001140", "journal": "Public Health Nutrition"}
+{"title": "Clausius versus Sackur-Tetrode entropies", "sha": "818e8e5feaac15196ae6be7ab3d0d1a9a6f605c8", "authors": ["Thomas Oikonomou", "G Baris Bagci"], "doi": null, "journal": null}
+{"title": "QUALIDADE FISIOL\u00d3GICA E COMPORTAMENTO DE SEMENTES DE SOJA (Glycine max (L.) Merrill) NO ARMAZENAMENTO E NO CAMPO*", "sha": "81c1a528f34d2e4793451a1e48e2a7c0366e57f3", "authors": ["J Filho", "R De Carvalho", "S Cicero", "C Dem\u00e9trio", "Resumo"], "doi": null, "journal": null}
+{"title": "Adsorption of Cu 2' and Ni 2' on iron oxide and kaolin and its importance on Ni 2' transport in porous media", "sha": "81d9bbba6495a7da2562105de8a776d829c1129d", "authors": ["Tushar Sen", "S Mahajan", "Kartic Khilar"], "doi": null, "journal": null}
+{"title": "41 APLICA\u00c7\u00c3O DE MODELO DE SIMULA\u00c7\u00c3O-OTIMIZA\u00c7\u00c3O NA GEST\u00c3O DE PERDA DE \u00c1GUA EM SISTEMAS DE ABASTECIMENTO LEAKAGE MANAGEMENT WITH COMPUTATIONAL MODEL IN WATER SUPPLY SYSTEM EDEVAR LUVIZOTTO JUNIOR", "sha": "82639edd392aec95427018f92c6f05a55dfdcb75", "authors": ["C Gumier", "E Luvizotto", "Sanit Ambient Eng", "32", "Carlos C\u00e9sar Gumier"], "doi": null, "journal": null}
+{"title": "Fiber Bragg grating sensor for simultaneous measurement of displacement and temperature", "sha": "829a0774aa383b4f2465857214edd810c498fc26", "authors": ["Youlong Yu", "Hwayaw Tam", "Wenghong Chung", "Muhtesem Demokan"], "doi": null, "journal": "OPTICS LETTERS"}
+{"title": "Beneficial role of tamoxifen in experimentally induced cardiac hypertrophy", "sha": "82ca239889e86a2e2af89cf1e3d6d1549717a27f", "authors": ["Bhoomika Patel", "Vishal Desai"], "doi": "10.1016/j.pharep.2014.02.004", "journal": "Pharmacological Reports"}
+{"title": "Task Parallelism and Data Distribution: An Overview of Explicit Parallel Programming Languages", "sha": "82eb4bc90c4d9a9b2f924996e3583942b4cad391", "authors": ["Dounia Khaldi", "Pierre Jouvelot", "Corinne Ancourt", "Fran\u00e7ois Irigoin"], "doi": null, "journal": null}
+{"title": "Neuropsychological abnormalities in children with the Panayiotopoulos syndrome point to parietal lobe dysfunction", "sha": "82fb17df19c7142623eb551135f9f924b911e8cf", "authors": ["Ricardo Lopes", "M\u00e1rio Sim\u00f5es", "Alberto Leal"], "doi": "10.1016/j.yebeh.2013.11.013", "journal": "Epilepsy & Behavior"}
+{"title": "Motion-Based Counter-Measures to Photo Attacks in Face Recognition", "sha": "82fbf7fe9b2114b87765e2aa62204b98a44dc89b", "authors": ["Andr\u00e9 Anjos", "Mohan Chakka", "S\u00e9bastien Marcel"], "doi": null, "journal": null}
+{"title": "A laser-ablation ICP-MS study of Apollo 15 low-titanium olivine-normative and quartz-normative mare basalts", "sha": "83712a06f6e65abf584dad268e3425aec6ff313b", "authors": ["Darren Schnare", "James Day", "Marc Norman", "Yang Liu", "Lawrence Taylor"], "doi": "10.1016/j.gca.2008.02.021", "journal": "Geochimica et Cosmochimica Acta"}
+{"title": "Translocation time of periodically forced polymer chains", "sha": "842382508576bc51de1fc86813a613a25cfee84d", "authors": ["Alessandro Fiasconaro", "Juan Mazo", "Fernando Falo"], "doi": "10.1103/physreve.82.031803", "journal": "Physical Review E"}
+{"title": "Synthesis of Moduli of Uniform Continuity by the Monotone Dialectica Interpretation in the Proof-system MinLog", "sha": "842d8ac31fa1bca3db9e6f694d8343f6f9a1e7c5", "authors": ["Mircea-Dan Hernest"], "doi": "10.1016/j.entcs.2007.01.023", "journal": "Electronic Notes in Theoretical Computer Science"}
+{"title": null, "sha": "842e64035bd43e5d9de23556e165287e6dfc9fe5", "authors": [], "doi": null, "journal": null}
+{"title": "A Service of zbw", "sha": "84b1c80eb3aae857900e6ce5c5acaeff228b1d11", "authors": [], "doi": null, "journal": null}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics Kurtosis modelling by means of the J-transformation Kurtosis modelling by means of the J-transformation", "sha": "84b1f1663e1f155d5c7c939d2a147e73038abfba", "authors": ["Matthias Fischer", "Klein", "Matthias Fischer", "Ingo Klein"], "doi": null, "journal": null}
+{"title": "Privacy Policy", "sha": "84d2a156f953276433d689711914203b541b32d1", "authors": [], "doi": null, "journal": null}
+{"title": "Effects of water hardness on the physiological responses to chronic waterborne silver exposure in early life stages of rainbow trout (Oncorhynchus mykiss)", "sha": "84d914fbb768e85f751e4143871999b9a3e5ff93", "authors": ["T Morgan", "C Guadagnolo", "M Grosell", "C Wood"], "doi": "10.1016/j.aquatox.2005.05.017", "journal": "Aquatic Toxicology"}
+{"title": "METRIC GRAPH RECONSTRUCTION FROM NOISY DATA", "sha": "84f74eaa6589d4f197bb57269933a5812969f163", "authors": ["Mridul Aanjaneya", "Frederic Chazal", "Daniel Chen", "Marc Glisse", "Leonidas Guibas", "Dmitriy Morozov"], "doi": null, "journal": null}
+{"title": "ORBITAL AND COLLISIONAL EVOLUTION OF THE IRREGULAR SATELLITES", "sha": "84f7667d2359096e8c69717f58412e8d1ef96f38", "authors": ["David Nesvorny\u00b4,", "Jose Alvarellos", "Luke Dones", "Harold Levison"], "doi": null, "journal": null}
+{"title": "OGLE 2008\u2013BLG\u2013290: an accurate measurement of the limb darkening of a galactic bulge K Giant spatially resolved by microlensing", "sha": "85291039170304dbd08e7676b98c925d112f24d8", "authors": ["P Fouqu\u00e9", "D Heyrovsk\u00fd", "S Dong", "A Gould", "A Udalski", "M Albrow", "V Batista", "J-P Beaulieu", "D Bennett", "I Bond", "D Bramich", "S Calchi Novati", "A Cassan", "C Coutures", "S Dieters", "M Dominik", "D Dominis Prester", "J Greenhill", "K Horne", "U J\u00f8rgensen", "S Koz\u0142owski", "D Kubas", "C-H Lee", "J-B Marquette", "M Mathiasen", "J Menzies", "L Monard", "S Nishiyama", "I Papadakis", "R Street", "T Sumi", "A Williams", "J Yee", "S Brillant", "J Caldwell", "A Cole", "K Cook", "J Donatowicz", "N Kains", "S Kane", "R Martin", "K Pollard", "K Sahu", "Y Tsapras", "J Wambsganss", "D Depoy", "B Gaudi", "C Han", "C-U Lee", "B-G Park", "M Kubiak", "M Szyma\u0144ski", "G Pietrzy\u0144ski", "I Soszy\u0144ski", "O Szewczyk", "K Ulaczyk", "F Abe", "A Fukui", "K Furusawa", "A Gilmore", "J Hearnshaw", "Y Itow", "K Kamiya", "P Kilmartin", "A Korpela", "W Lin", "C Ling", "K Masuda", "Y Matsubara", "N Miyake", "Y Muraki", "M Nagaya", "K Ohnishi", "T Okumura", "Y Perrott", "N Rattenbury", "T Saito", "T Sako", "S Sato", "L Skuljan", "D Sullivan", "W Sweatman", "P Tristram", "A Allan", "M Bode", "M Burgdorf", "N Clay", "S Fraser", "E Hawkins", "E Kerins", "T Lister", "C Mottram", "E Saunders", "C Snodgrass", "I Steele", "T Anguita", "V Bozza", "K Harps\u00f8e", "T Hinse", "M Hundertmark", "P Kj\u00e6rgaard", "C Liebig", "L Mancini", "G Masi", "S Rahvar", "D Ricci", "G Scarpetta", "J Southworth", "J Surdej", "C Th\u00f6ne", "A Riffeser", "S Seitz"], "doi": "10.1051/0004-6361/201014053", "journal": "Astronomy and Astrophysics"}
+{"title": "Layout Synthesis Algorithm of Embedded Passive Components for RF and EMC Reliable System Design", "sha": "85a1869dcb018ff286d520b3b110fbac48be2604", "authors": ["Grit Wemer", "John Reichl"], "doi": null, "journal": null}
+{"title": "ON-LINE SEVERITY ASSESSMENT OF BEARING DAMAGE VIA DEFECT SENSITIVE RESONANCE IDENTIFICATION AND MATCHED FILTERING", "sha": "85e973e01c378aa42da7ac7d6ea9f7dbdea239c1", "authors": ["C Lx", "S Wu"], "doi": null, "journal": "Mechanical Systems and Signal Processing"}
+{"title": "Printed in the United States of America", "sha": "8654b179494575734588266b53fdd550396805ba", "authors": ["Jennifer Huang", "Jiang Wang"], "doi": null, "journal": "Macroeconomic Dynamics"}
+{"title": "Infectious Diseases Society of America Guidelines for the Diagnosis and Treatment of Asymptomatic Bacteriuria in Adults SUMMARY OF RECOMMENDATIONS", "sha": "86c645ff1be7f1b4108582c98fabf4bea67ef832", "authors": ["\u2022 Idsa Guidelines For Asymptomatic Bacteriuria", "Cid"], "doi": null, "journal": null}
+{"title": "M P RA Munich Personal RePEc Archive Keynesian Beauty Contest, Accounting Disclosure, and Market Efficiency", "sha": "86efc1aab363f3f24ae8ca2c796d1d6340faad43", "authors": ["Pingyang Gao"], "doi": null, "journal": null}
+{"title": "Sex differences in the pattern of innominate motion during passive hip abduction and external rotation", "sha": "870c08ced2d2d9fe828ceecac51ce62b5b32c89d", "authors": ["Melanie Bussey", "Stephan Milosavljevic", "Melanie Bell"], "doi": "10.1016/j.math.2008.09.004", "journal": "Manual Therapy"}
+{"title": "THE INTERRELATIONSHIP OF CELL GROWTH AND DIVISION IN HAPLOID AND DIPLOID CELL OF SACCHAROMYCES", "sha": "871e77317d6b18c2ab1945203be0ca35a4b9d9a3", "authors": ["Julian Cerevisiae", "Adams"], "doi": null, "journal": "Experimental Cell Research"}
+{"title": "Regularized Structured Output Learning with Partial Labels", "sha": "8771f2298a70d7939e47bd195eb34b3bcd9b4e3b", "authors": ["Sundararajan Sellamanickam", "Charu Tiwari", "Sathiya Selvaraj"], "doi": null, "journal": null}
+{"title": "Using Speed Diagrams for Symbolic Quality Management", "sha": "87b6c263c287803c88b55ea8511b937d78443ed9", "authors": ["Jacques Combaz", "Jean-Claude Fernandez", "Joseph Sifakis", "Loic Strus"], "doi": null, "journal": null}
+{"title": null, "sha": "87c0e6c3e6f71bf1dd1c01c120f4274ae4a12268", "authors": [], "doi": null, "journal": null}
+{"title": "Neurobehavioral Function in School-Age Children Exposed to Manganese in Drinking Water", "sha": "87c3f9e7bfe5800202f956f7c7ce34705e5d112d", "authors": ["Citation Oulhote", "Donna Mergler", "Benoit Barbeau", "David Bellinger", "Th\u00e9r\u00e8se Bouffard", "Marie-\u00c8ve Brodeur", "Dave Saint-Amour", "Melissa Legrand", "S\u00e9bastien Sauv\u00e9", "Maryse"], "doi": null, "journal": "Environmental Health Perspectives \u2022"}
+{"title": "Structural Modeling of Protein Interactions by Analogy: Application to PSD-95", "sha": "87e576472d0676dd2e3140ae077cba3f80f6c96f", "authors": ["Dmitry Korkin", "Fred Davis", "Frank Alber", "Tinh Luong", "Min-Yi Shen", "Vladan Lucic", "Mary Kennedy", "Andrej Sali"], "doi": null, "journal": null}
+{"title": "Evaluating digital elevation models for glaciologic applications: An example from Nevado Coropuna, Peruvian Andes", "sha": "88515146b39c3f7f7ebb15cf14207c6b2909e0d0", "authors": ["Adina Racoviteanu", "William Manley", "Yves Arnaud", "Mark Williams"], "doi": "10.1016/j.gloplacha.2006.11.036", "journal": "Global and Planetary Change"}
+{"title": "Modeling the size and shape of Saturn's magnetopause with variable dynamic pressure", "sha": "885302ad73ff70e1a8755c757cd0f87cdfa8d4d9", "authors": ["C Arridge", "N Achilleos", "M Dougherty", "K Khurana", "C Russell"], "doi": "10.1029/2005ja011574", "journal": "Journal of Geophysical Research"}
+{"title": "Observation of high-order harmonic generation in a bulk crystal", "sha": "88c97d23f5763189225a8adc73fa8fd2dbb2572d", "authors": ["Shambhu Ghimire", "Anthony Dichiara", "Emily Sistrunk", "Pierre Agostini", "Louis Dimauro", "David Reis"], "doi": "10.1038/nphys1847", "journal": "Nature Physics"}
+{"title": "Amazon Floodplain Water Level Changes Measured with Interferometric SIR-C Radar", "sha": "88f3324c1b6901ca52f36a063d590df9c38796b0", "authors": ["Douglas Alsdorf", "Laurence Smith", "John Melack"], "doi": null, "journal": "IEEE TRANSACTIONS ON GEOSCIENCE AND REMOTE SENSING"}
+{"title": "Anti-diabetic Action of 7-O-Galloyl-d-sedoheptulose, a Polyphenol from Corni Fructus, through Ameliorating Inflammation and Inflammation-Related Oxidative Stress in the Pancreas of Type 2 Diabetics MATeRIAlS ANd MeThOdS Materials Protease inhibitor mixture solution, 4,6-dihy-droxy-2-mercaptopyrimidine (2-thiobarbituric acid, TBA), and 10% neutral-buffered formalin were purchased from Wako", "sha": "892b7a96ea8da0649dc1f0b302f6a9b9b2a59c0d", "authors": ["Chan Park", "Takashi Tanaka", "Takako Yokozawa"], "doi": null, "journal": "Biol. Pharm. Bull"}
+{"title": "1 1 1 1 RESENHAS/BOOK REVIEWS", "sha": "8954de4f74c70410407678845565d2e279d6fd73", "authors": ["Pedro Castelo", "Branco Silveira", "Katrina Brandon", "Kent Redford", "Steven Sanderson"], "doi": null, "journal": null}
+{"title": "Demonstration of conditional gate operation using superconducting charge qubits", "sha": "897b373bdc60ef18d77f9df304d88bb4558ba597", "authors": ["T Yamamoto", "Yu Pashkin", "O Astafiev", "Y Nakamura", "J Tsai"], "doi": null, "journal": null}
+{"title": "DECOMPOSABLE CHAINABLE CONTINUAO", "sha": "899476c2dc5159a665f4ad95c481b82ef50f0ea5", "authors": ["J F\u00fagate"], "doi": null, "journal": null}
+{"title": "Characteristics of III-nitride photodiodes with self-assembled quantum dots", "sha": "89fae62792497b43e10a7e6cf597b5a090c6d051", "authors": ["Liang-Wen Ji", "Te-Hua Fang", "Sheng-Joue Young", "Chi-Chung Liu", "Yin-Lai Chai"], "doi": "10.1016/j.matlet.2006.07.104", "journal": "Materials Letters"}
+{"title": "Effects of Nutrient Restrictions on Confined Animal Facilities: Insights from a Structural Model", "sha": "8a7273bb67302887df0ba4bdff2fd74308225761", "authors": ["Kenneth Baerenklau", "Nermin Nergis", "Kurt Schwabe"], "doi": null, "journal": null}
+{"title": "Restructuring of COBOL=CICS legacy systems", "sha": "8a755d7b29cefeb2c86ca32418b910302cf598d2", "authors": ["Alex Sellink", "Harry Sneed", "Chris Verhoef"], "doi": null, "journal": "Science of Computer Programming"}
+{"title": "Impact and cost of a 2-week community-based screening and awareness program for diabetes and cardiovascular risk factors in a Swiss canton", "sha": "8aeef1e9453786a1b5d76b0c68e3ad4f2978ad16", "authors": ["Pascal Bovet", "Hirsiger", "Emery", "De Bernardini", "Rossier", "Trebeljahr", "Hagon"], "doi": "10.2147/dmso.s20649", "journal": "Diabetes, Metabolic Syndrome and Obesity: Targets and Therapy"}
+{"title": "87258 Links in a Distributed Database: Theory and Implementation", "sha": "8affbc8f2aa4504802548c967aa4014f641645e6", "authors": ["Nicholas Karonis", "Martin Kraimerr"], "doi": null, "journal": null}
+{"title": "QoS Management in Fixed Broadband Residential Gateways", "sha": "8b079f9f5ff40d82e9af308b7884b191541c9c3b", "authors": ["C Guerrero", "J Garcia", "F Valera", "A Azcorra"], "doi": null, "journal": "LNCS"}
+{"title": "Effectiveness of a cognitive-behavioral group intervention for knee osteoarthritis pain: protocol of a randomized controlled trial", "sha": "8ba5622d58b0807dde1e943483f2cf5720fd1ea7", "authors": ["Eeva-Eerika Helminen", "Sanna Sinikallio", "Anna Valjakka", "Rauni V\u00e4is\u00e4nen-Rouvali", "Jari Arokoski"], "doi": null, "journal": null}
+{"title": "Long-term debt and hidden borrowing *", "sha": "8bfa0ee563338f4bc0de961e321232d3adb9d6c5", "authors": ["Heski Bar-Isaac", "Vicente Cu\u00f1at", "Ken Ayotte", "Alberto Bisin", "Patrick Bolton", "Leonardo Felli", "Charles Goodhart", "John Moore", "Tomasz Piskorski", "Malcom Wardlaw", "Larry White"], "doi": null, "journal": null}
+{"title": "Silicon isotopes in spring Southern Ocean diatoms: Large zonal changes despite homogeneity among size fractions", "sha": "8c165f73a17388cac53ec899002a5a72bbbbbab0", "authors": ["Damien Cardinal", "Nicolas Savoye", "Thomas Trull", "Frank Dehairs", "Elzbieta Kopczynska", "Fran\u00e7ois Fripiat", "Jean-Louis Tison", "Luc Andr\u00e9"], "doi": "10.1016/j.marchem.2006.04.006", "journal": "Marine Chemistry"}
+{"title": "Mycorrhizal Networks: Common Goods of Plants Shared under Unequal Terms of Trade", "sha": "8c8ac13f8d8cbd11f824ea687cfef138c2bc7f3f", "authors": ["F Walder", "H Niemann", "M Natarajan", "M Lehmann", "T Boller", "A Wiemken"], "doi": "10.1104/pp.112.195727", "journal": "PLANT PHYSIOLOGY"}
+{"title": "Apples, oranges and fruit salad: A Delphi study of the IMC educational mix", "sha": "8d295058a627d1d1b8a30c0cab5db32003326d9f", "authors": ["Gayle Kerr"], "doi": "10.1080/13527260902757522", "journal": "Journal of Marketing Communications"}
+{"title": "Brune sections in the non-stationary case", "sha": "8d36bf564e0445ea60c8229927750060106e6baa", "authors": ["Daniel Alpay", "Vladimir Bolotnikov", "Patrick Dewilde", "Aad Dijksma"], "doi": null, "journal": "Linear Algebra and its Applications"}
+{"title": "\"Othering\" the health worker: self-stigmatization of HIV/AIDS care among health workers in Swaziland", "sha": "8d99a5f0b3a0f1d8f09ca6c18158d50fb5fc3fa3", "authors": ["Daniel De Vries", "Shannon Galvin", "\u2020", "Masitsela Mhlanga", "Brian Cindzi", "Thabsile Dlamini"], "doi": null, "journal": null}
+{"title": "Spatial Analysis of the Labour Market by Using Econometric Tools. The Case of Lower Silesia Region (Dolno\u015bl\u0105skie Voivodship)", "sha": "8e0bdf56e396d86a2b4e2615f92ce4d49219ce19", "authors": ["El\u017cbieta Litwi\u0144ska"], "doi": "10.2478/v10103-012-0032-8", "journal": "Comparative Economic Research"}
+{"title": "Article Natural History of Male Psychological Health, XV: Retirement Satisfaction", "sha": "8eb45e31369a1e2642f9b5d4259318ba571e694d", "authors": ["George Vaillant", "C Ana", "Sc Dirago", "Mukamal"], "doi": null, "journal": "Am J Psychiatry"}
+{"title": "Christensen measurability and some functional equation", "sha": "8f0a1f76023ceafbf0fa37be1a03862677db26ec", "authors": ["Eliza Jabb Lo\u00b4nskalo\u00b4nska"], "doi": "10.1007/s00010-010-0056-8", "journal": "Aequationes mathematicae"}
+{"title": "A Search for Bright Kuiper Belt Objects", "sha": "8f16b112bd2b4392b722eff28e3cdc74c9e2a12a", "authors": ["Michael Brown", "R Webster"], "doi": null, "journal": "Publ. Astron. Soc. Aust"}
+{"title": null, "sha": "8f36171348932f1cda47f46004fb981954e9a164", "authors": ["S\u00edlvio Dahmen"], "doi": null, "journal": null}
+{"title": "Cochrane corner: is integrated disease management for patients with COPD effective?: Table 1", "sha": "8f6e0cbc715d86cad1698e18df5e4aa2ac2022c3", "authors": ["Annemarije Kruis", "Nynke Smidt", "Willem Assendelft", "Jacobijn Gussekloo", "Melinde Boland", "Maureen Rutten-Van M\u00f6lken", "Niels Chavannes"], "doi": "10.1136/thoraxjnl-2013-204974", "journal": "Thorax"}
+{"title": "Movement before Cinematography: The High-Speed Qualities of Sentiment", "sha": "8f9325389c0d7d76276b982a01f80d75dde44dfa", "authors": ["Jimena Canales"], "doi": "10.1177/1470412906070518", "journal": "Journal of Visual Culture"}
+{"title": "What are Authentic Pharmaceuticals Worth?", "sha": "8fd9e7d8939f13b2c1d2863a80349f2b4d37f5f7", "authors": ["Matthieu Schapranow", "J\u00fcrgen M\u00fcller", "Martin Lorenz", "Alexander Zeier", "Hasso Plattner"], "doi": null, "journal": null}
+{"title": "USING HIGH-RESOLUTION OPTICAL SPECTRA TO MEASURE INTRINSIC PROPERTIES OF LOW-MASS STARS: NEW PROPERTIES FOR KOI-314 AND GJ 3470", "sha": "90010bc519bf52056fd60a89ef534b79f626b73f", "authors": ["J Pineda", "Michael Bottom", "John Johnson"], "doi": "10.1088/0004-637x/767/1/28", "journal": "The Astrophysical Journal"}
+{"title": "THE LINEAR HEAT EQUATION WITH HIGHLY OSCILLATING POTENTIAL", "sha": "900b1683a8a8fb212958c2730a3268fd38834edd", "authors": ["Ismail Kombe"], "doi": null, "journal": "PROCEEDINGS OF THE AMERICAN MATHEMATICAL SOCIETY"}
+{"title": "Gastrointestinal stromal tumors: a clinicopathological and immunohistochemical study of 121 cases", "sha": "900c40d08c7dabc92a530be45f730234f662ab8d", "authors": ["Mukul Vij", "Vinita Agrawal", "Ashok Kumar", "Rakesh Pandey"], "doi": "10.1007/s12664-010-0079-z", "journal": "Indian Journal of Gastroenterology"}
+{"title": "Software Piracy and the Doris Day Syndrome: Some Legal, Ethical and Social Implications of Contemporary Conceptions of Property", "sha": "9039ac28aadc88bc71411c7d42be377cb10428c9", "authors": ["James Couser"], "doi": null, "journal": "International Journal of Law and Information Technology"}
+{"title": "Ad Hoc On-Demand Backup Node Setup Routing Protocol", "sha": "9052c89793990fb392b0e037043f8fa686067af3", "authors": ["Ying-Hong Wang", "Chih-Chieh Chuang"], "doi": null, "journal": "JOURNAL OF INFORMATION SCIENCE AND ENGINEERING"}
+{"title": "The Nursing Worklife Model: Extending and Refining a New Theory", "sha": "906f6f19db9ae0e0d4ad9f7b52723c0715d0f82b", "authors": ["Milisa Manojlovich P H D , R N , C C R N", "Heather Laschinger"], "doi": null, "journal": null}
+{"title": "A SIMULATION MODEL TO ANALYZE THE IMPACT OF HOLE SIZE ON PUTTING IN GOLF", "sha": "9071b70f43757a38d626c560da99eb23229ac266", "authors": ["Matulya Bansal", "Mark Broadie"], "doi": null, "journal": null}
+{"title": "Agent-Oriented Material Flow Control System Based on DCOM", "sha": "90ea42b46f9b4740e291a5e95ff75ce2c3fc42e3", "authors": ["Ronald Schoop", "Ralf Neubert"], "doi": null, "journal": null}
+{"title": "Sulphated AlMCM-41: Mesoporous solid Br\u00f8nsted acid catalyst for dibenzoylation of biphenyl", "sha": "90ee57d508a4acca3208c223b9e93f762c8624f2", "authors": ["Ng Poh", "Hadi Nur", "Mohd Muhid", "Halimaton Hamdan"], "doi": "10.1016/j.cattod.2006.01.010", "journal": "Catalysis Today"}
+{"title": "Ketjenblack Carbon Supported Amorphous Manganese Oxides Nanowires as Highly Efficient Electrocatalyst for Oxygen Reduction Reaction in Alkaline Solutions", "sha": "90f1d4fc3979b0b009a7f4a057d8e1be65321026", "authors": ["Jang-Soo Lee", "Gi Park", "Ho Lee", "Sun Kim", "Ruiguo Cao", "Meilin Liu", "Jaephil Cho"], "doi": "10.1021/nl2029078", "journal": "Nano Letters"}
+{"title": "The Language Observatory Project (LOP)", "sha": "914efcd0fd5e2b6a14bf63bcba1d2b159edd139e", "authors": ["Yoshiki Mikami", "Pavol Zavarsky", "Mohd Zaidi", "Abd Rozan", "Izumi Suzuki", "Masayuki Takahashi", "Tomohide Maki", "Irwan Ayob", "Paolo Boldi", "Massimo Santini", "Sebastiano Vigna"], "doi": null, "journal": null}
+{"title": "Sustaining Fisheries Yields Over Evolutionary Time Scales", "sha": "9168b940960b29e950787ec33b8e93cc9f67d9c3", "authors": ["David Conover", "Stephan Munch"], "doi": null, "journal": null}
+{"title": "Stability, participation and transparency in renewable energy policy: Lessons from Denmark and the United States", "sha": "91ca1537ae9c9c573a81cd8ac5ec5ffbad28881d", "authors": ["Miguel Mendon\u00e7a", "Stephen Lacey", "Frede Hvelplund"], "doi": "10.1016/j.polsoc.2009.01.007", "journal": "Policy and Society"}
+{"title": "The Influence of Physical Attractiveness and Gender on Ultimatum Game Decisions", "sha": "9211c793469038bc33c6ef5bf424a6317a539805", "authors": ["Sara Solnick", "Maurice Schweitzer"], "doi": null, "journal": "Organizational Behavior and Human Decision Processes"}
+{"title": "TWO-DIMENSIONAL STEADY-STATE OSCILLATION PROBLEMS OF ANISOTROPIC ELASTICITY", "sha": "9223a2fca341bcb86e3e476d1774e4e282c28532", "authors": ["D Natroshvili"], "doi": null, "journal": null}
+{"title": "Full-Search-Equivalent Pattern Matching with Incremental Dissimilarity Approximations", "sha": "92488ad800943b4e0488331e468120b3e41baa6d", "authors": ["F Tombari", "S Mattoccia", "L Di Stefano"], "doi": "10.1109/tpami.2008.46", "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence"}
+{"title": "Switching-mode-dependent magnetic interlayer coupling strength in spin valves and magnetic tunnel junctions", "sha": "927157219b7690a020e90c12e954ba21cd1625f2", "authors": ["Y Pennec", "J Camarero", "J Toussaint", "S Pizzini", "M Bonfim", "F Petroff", "W Kuch", "F Offi", "K Fukumoto", "F Nguyen Van Dau", "J Vogel"], "doi": "10.1103/physrevb.69.180402", "journal": "Physical Review B"}
+{"title": "Size\u2013sound symbolism revisited", "sha": "9283e341dbecdb3dc220c0a0f575c48fc64102c8", "authors": ["Professor Emeritus", "Hebrew Literature"], "doi": "10.1016/j.pragma.2005.12.002", "journal": "Journal of Pragmatics"}
+{"title": "Estimation and Regularization Techniques for Regression Models with Multidimensional Prediction Functions Estimation and Regularization Techniques for Regression Models with Multidimensional Prediction Functions", "sha": "92c2868b73f2d1f4701296c07b5c823eed437acf", "authors": ["Matthias Schmid", "Sergej Potapov", "Annette Pfahlberg", "Torsten Hothorn", "Matthias Schmid", "Sergej Potapov", "Annette Pfahlberg", "Torsten Hothorn"], "doi": null, "journal": null}
+{"title": "Allometry of natural mortality as a basis for assessing optimal release size in fish-stocking programmes", "sha": "92ec6a2aa0581d32ac90c4184e4b94b221ec5a68", "authors": ["Kai Lorenzen"], "doi": null, "journal": null}
+{"title": "Intercellular Trogocytosis Plays an Important Role in Modulation of Immune Responses", "sha": "930317934dfdd0e407fa648248bf163da6b55ab1", "authors": ["Khawaja Ahmed", "Manjunatha Munegowda", "Yufeng Xie", "Jim Xiang"], "doi": null, "journal": null}
+{"title": "Modelling treatment effects in a clinical Bayesian network using Boolean threshold functions", "sha": "930cf107d9826ef14efe776f073198eb72a7a47d", "authors": ["Stefan Visscher", "Peter Lucas", "Carolina Schurink", "Marc Bonten"], "doi": "10.1016/j.artmed.2008.11.006", "journal": "Artificial Intelligence in Medicine"}
+{"title": "Ethical aspects of clinical chemistry", "sha": "93a297d7f9aeb9280d6137b71a34168b8b48623d", "authors": ["Ezra Bengershom"], "doi": null, "journal": "Journal ofmedical ethics"}
+{"title": "Proactive problem-solver for construction", "sha": "93bad2054e1d64a6eb4b5ae3bf23b30a53042e67", "authors": ["Wen-Der Yu", "Jyh-Bin Yang", "Judy Tseng", "Shen-Jung Liu", "Ji-Wei Wu"], "doi": "10.1016/j.autcon.2010.05.003", "journal": "Automation in Construction"}
+{"title": "BRIEF REPORT Hypophysectomy Reduces Behavioral Activation to Morphine in the Rat", "sha": "93e3d3d702205a8362059da015da1f70075c1e46", "authors": ["R Katz"], "doi": null, "journal": "BEHAVIORAL AND NEURAL BIOLOGY"}
+{"title": "Gap colonization in the Patagonian semidesert: seed bank and diaspore morphology", "sha": "944d04e68cf1c5b3a854c5ac11d1307b8015e13a", "authors": ["Roberto Fern\u00e1ndez", "Rodolfo Golluscio", "Alejandro Bisigato", "Alberto Soriano C Fern\u00e1ndez", "R Golluscio", "R Bisigato", "A Soriano", "A Gap"], "doi": null, "journal": "ECOGRAPHY"}
+{"title": null, "sha": "949111c18bec095c2e707c36088463262a9ce44c", "authors": ["Rachna Begh", "Marcus Munaf\u00f2", "Saul Shiffman", "Stuart Ferguson", "Linda Nichols", "Mohammed Mohammed", "Roger Holder", "Stephen Sutton", "Paul Aveyard"], "doi": null, "journal": null}
+{"title": "SPECTROPHOTOMETRIC DETERMINATION OF THE OPTICAL PROPERTIES OF AN ADSORBED OXYGEN LAYER ON GOLD", "sha": "949b283c8a0291c0742f035d9aab977626198068", "authors": ["D Kolb", "J Mcintyre"], "doi": null, "journal": "SURFACE SCIENCE"}
+{"title": "Telaprevir with Peginterferon and Ribavirin for Chronic HCV Genotype 1 Infection A BS TR AC T Background", "sha": "94dd3afc470b081b977c5b81d70749894fa2dd4c", "authors": ["John Mchutchison", "Gregory Everson", "Stuart Gordon", "Ira Jacobson", "Mark Sulkowski", "Robert Kauffman", "Lindsay Mcnair", "John Alam", "Andrew Muir"], "doi": null, "journal": null}
+{"title": "Monozygotic twin sisters discordant for familial hemiplegic migraine", "sha": "952059f84aa04b11f2ce71192a959af8f273fc78", "authors": ["Jos\u00e9 Barros", "Rui Barreto", "Ana Brand\u00e3o", "Joana Domingos", "Joana Dam\u00e1sio", "Cristina Ramos", "Carolina Lemos", "Jorge Sequeiros", "Isabel Alonso", "Jos\u00e9 Pereira-Monteiro"], "doi": null, "journal": null}
+{"title": "Phylogeny for the faint of heart: a tutorial", "sha": "95454927e0a1ccbf6260feb3e08c060b5bd82450", "authors": ["Sandra Baldauf"], "doi": "10.1016/s0168-9525(03)00112-4", "journal": "Trends in Genetics"}
+{"title": "Information provision, policy support, and farmers\u2019 adaptive responses against drought: An empirical study in the North China Plain", "sha": "95c625fa375bd44592a89a505a1305f4303dae3d", "authors": ["Jinxia Wang", "Yu Yang", "Jikun Huang", "Kevin Chen"], "doi": "10.1016/j.ecolmodel.2014.12.013", "journal": "Ecological Modelling"}
+{"title": "Is public R & D a complement or substitute for private R & D? A review of the econometric evidence", "sha": "95d4b80a3935564369a41b0cf6bab2b2c6d76968", "authors": ["Paul David", "Bronwyn Hall", "Andrew Toole"], "doi": null, "journal": "Research Policy"}
+{"title": "Nanomechanical properties of GaSe thin films deposited on Si(111) substrates by pulsed laser deposition", "sha": "95ea4042276e558b89c04abd80d292b35e2688c3", "authors": ["Sheng-Rui Jian", "Jenh-Yih Juang", "Chih-Wei Luo", "Shin-An Ku", "Kaung-Hsiung Wu"], "doi": "10.1016/j.jallcom.2012.07.089", "journal": "Journal of Alloys and Compounds"}
+{"title": "High singleton live birth rate following classical ovulation induction in normogonadotrophic anovulatory infertility (WHO 2)", "sha": "95f8cbc8239660d8f7b2d79bfd517885286ee2da", "authors": ["Marinus Eijkemans", "Babak Imani", "Annemarie Mulders", "J Dik", "F Habbema", "Bart Fauser"], "doi": "10.1093/humrep/deg459", "journal": "Human Reproduction"}
+{"title": "Introducing high performance distributed logging service for ACS", "sha": "96144660134808afbc2e48b5361e2941bd32a8b6", "authors": ["Jorge Avarias", "Joao L\u00f3pez", "Crist\u00edan Maureira", "Heiko Sommer", "Gianluca Chiozzi"], "doi": null, "journal": null}
+{"title": "Minute Virus of Mice Non-structural Protein NS-1 Is Necessary and Sufficient for Trans-activation of the Viral P39 Promoter", "sha": "961c96ea3b100d9eadf063aaeb5ceab46c8b2bee", "authors": ["Christian Doerig", "Bernhard Hirt", "Peter", "Jean-Philippe Antonietti"], "doi": null, "journal": "J. gen. Virol"}
+{"title": "Approximation of Event Probabilities in Noisy Cellular Processes", "sha": "9626ee305020010e52fe72ea8e08cfb16a584afa", "authors": ["Fr\u00e9d\u00e9ric Didier", "Thomas Henzinger", "Maria Mateescu", "Verena Wolf"], "doi": null, "journal": null}
+{"title": "Model Based On-line Energy Prediction System for Semi-Autonomous Mobile Robots", "sha": "968e14596309da395dd292e461388812a391aaf3", "authors": ["Ramviyas Parasuraman", "Keith Kershaw", "Prithvi Pagala", "Manuel Ferre"], "doi": null, "journal": null}
+{"title": "Operative Decisions for Endoscopic Treatment of Cubital Tunnel Syndrome", "sha": "971b7185c1de1b6cc7eba9c7ca3f7ff9ea2e8245", "authors": ["Hans-Georg Damert", "Silke Altmann", "Manfred Infanger", "Armin Kraus"], "doi": "10.3928/01477447-20130426-04", "journal": "Orthopedics"}
+{"title": "A New Dynamic Model for the Kink Effect in InAlAs/InGaAs HEMTs", "sha": "975e033890ab9752b51e6c166263e448fe99da25", "authors": ["Mark Somerville", "Alexander Ernst", "Jes6s Del Alamo"], "doi": null, "journal": null}
+{"title": "Reconstructing Detailed Dynamic Face Geometry from Monocular Video", "sha": "976208323fd68f403e3d7c66f66d39f8788fe24c", "authors": ["Pablo Garrido", "Levi Valgaerts", "\u2020 Wu", "Christian Theobalt"], "doi": null, "journal": null}
+{"title": "A Service of zbw Stock market development and economic growth", "sha": "98171049ce2a539648e80feb64f8b206a6435165", "authors": [], "doi": null, "journal": null}
+{"title": "Constructing Optimal Policies for Agents with Constrained Architectures", "sha": "984ee076390652b3d157af8f12b639720d60e721", "authors": ["Dmitri Dolgov", "Edmund Durfee"], "doi": null, "journal": null}
+{"title": "The Connection between the Depopulation of Localities and Passenger Rail Services in the Province of Buenos Aires in Argentina between 1960 and 2009", "sha": "98592cd4965ffcd353a94f83f1f8c23cf56a4c01", "authors": ["Juan Manuel Diez-Tetamanti", "Melisa Pontrelli-Albisetti", "M Pontrelli-Albisetti", "Juan Manuel Diez-Tetamanti", "Melisa Pontrelli", "Albisetti"], "doi": "10.11648/j.ss.20130204.12", "journal": "Social Sciences"}
+{"title": "The DNA SET: a novel device for single-molecule DNA sequencing", "sha": "9859deb2b021db124f8c61ec3cb7b1ab52bfdb4d", "authors": ["P Mali", "R Lal"], "doi": "10.1109/ted.2004.839740", "journal": "IEEE Transactions on Electron Devices"}
+{"title": "LARGE SCALE SIMULATIONS OF MELTING IN TWO FOR A METASTABLE HEXATIC PHASE DIMENSIONAL LENNARD-JONES SYSTEMS: EVIDENCE", "sha": "987cf8f9536b71ac51abe4c0cbdecad5fab2fd2a", "authors": ["Kun Chen", "Theodore Kaplan", "Mark Mostoller"], "doi": null, "journal": null}
+{"title": "EVALUATING JACQUET'S GL(n) WHITTAKER FUNCTION", "sha": "988634dd20bd765cf6493c1496136d0b59ed6e3e", "authors": ["Kevin Broughan"], "doi": null, "journal": "MATHEMATICS OF COMPUTATION"}
+{"title": "A Service of zbw", "sha": "98b6b1c1fb262edb1dd3e3679593acb266b60f3a", "authors": [], "doi": null, "journal": null}
+{"title": "Relations Between Stock Returns and Fundamental Variables: Evidence from a Segmented Market", "sha": "98ce00501f3762feea38d2e8b7d24ac4c9d35707", "authors": ["Manjeet Dhatt", "Yong Kim", "Sandip Mukherji"], "doi": null, "journal": "Asia-Pacific Financial Markets"}
+{"title": "Testicular torsion: Orchiectomy or orchiopexy?", "sha": "993a6cccd834585a2804b29a826fb984cd83d603", "authors": ["Seppo Taskinen", "Mervi Taskinen", "Risto Rintala"], "doi": "10.1016/j.jpurol.2007.11.007", "journal": "Journal of Pediatric Urology"}
+{"title": null, "sha": "994fd2707774397a3c1fde6e7ac3d9661ce83c61", "authors": [], "doi": null, "journal": null}
+{"title": "A High Resolution First Order Noise-Shaping Vernier Time-to-Digital Converter", "sha": "99ccdbabfaafa1f0384f7b9ada265195a1fa53a8", "authors": ["Majid Memarian Sorkhabi", "Siroos Toofan"], "doi": "10.12691/ajeee-1-2-1", "journal": "American Journal of Electrical and Electronic Engineering"}
+{"title": "Platelet Endothelial Cell Adhesion Molecule-1 Mediates Endothelial-Cardiomyocyte Communication and Regulates Cardiac Function", "sha": "99eb13c577158707929f26be627c28a05ecb03fc", "authors": ["Margaret Mccormick", "Caitlin Collins", "Catherine Makarewich", "Zhongming Chen", "Mauricio Rojas", "Monte Willis", "Steven Houser", "Ellie Tzima"], "doi": null, "journal": null}
+{"title": "[Cu(4-oxopyrimidinate)2\u00b7nH2O]\u221e: a robust sodalite type metal-organic framework exhibiting a rich host\u2013guest chemistry", "sha": "9a1109c28d3738f10da562be2e0ac83d1dc11979", "authors": ["Elisa Barea", "Jorge Navarro", "Juan Salas", "Norberto Masciocchi", "Simona Galli", "Angelo Sironi"], "doi": "10.1016/j.poly.2002.08.001", "journal": "Polyhedron"}
+{"title": "56 SCTP: A Proposed Standard for Robust Internet Data Transport", "sha": "9a140ce01c1f4404a9437fb720ba1c2fe4f162d1", "authors": ["Armando Caro", "R Janardhan", "Paul Iyengar", "Sourabh Amer", "Gerard Ladha", "I Heinz", "C Keyur", "Shah"], "doi": null, "journal": null}
+{"title": "THE EARLY YEARS: Evaluating Montessori Education", "sha": "9a2cbe121508e2013c42a3d2b9878d0e05173b4b", "authors": ["A Lillard"], "doi": "10.1126/science.1132362", "journal": "Science"}
+{"title": "Model-Based Validation of QoS Properties of Biomedical Sensor Networks", "sha": "9abdc192d07adf4c07b677aac1ba0f71115c054a", "authors": ["Simon Tschirner", "Liang Xuedong", "Wang Yi"], "doi": null, "journal": null}
+{"title": "Caspase-dependent initiation of apoptosis and necrosis by the Fas receptor in lymphoid cells: onset of necrosis is associated with delayed ceramide increase", "sha": "9acee83f483196c813c5b9ec2e153c0e3e4d7b0b", "authors": ["C Hetz"], "doi": "10.1242/jcs.00153", "journal": "Journal of Cell Science"}
+{"title": "La Declaraci\u00f3n STROBE o c\u00f3mo mejorar la presentaci\u00f3n de los estudios observacionales", "sha": "9afe783bb2f56fa81f6d92dbdcdf0c04d07a16b1", "authors": [], "doi": null, "journal": null}
+{"title": "Chloride-Dependent Intracellular pH Regulation via Extracellular Calcium-Sensing Receptor in the Medullary Thick Ascending Limb of the Mouse Kidney", "sha": "9b19af750904e12167d89965ea3925bf959af3b9", "authors": ["A Maruyama", "Y Tsuchiya", "S Takahashi", "S", "Kondo", "Y"], "doi": null, "journal": "Tohoku J. Exp. Med"}
+{"title": "An Application of Genetic Algorithms t o Evolve Hopfield type Optimum Network Architectures for Object Extraction", "sha": "9b314a7bd7fae6a97e76d938fd778581ba444525", "authors": ["Susmita De", "Ashish Ghosli", "Sankar Pal"], "doi": null, "journal": null}
+{"title": "NBER WORKING PAPER SERIES A DUAL POLICY PARADOX: WHY HAVE TRADE AND IMMIGRATION POLICIES ALWAYS DIFFERED IN LABOR-SCARCE ECONOMIES?", "sha": "9b41dc3e2df7e805ced3595dceb90a283d6adbfc", "authors": ["Timothy Hatton", "Jeffrey Williamson", "Toke Aidt", "Pieter Bevelander", "Roger Bourque", "Don Devoretz", "Peter Lindert", "Jim Robinson", "Ken Sokoloff"], "doi": null, "journal": null}
+{"title": "Speed estimation with propagation maps", "sha": "9b61c06df88a993f50abcc6011fff54c1e8ab812", "authors": ["C Rasche"], "doi": "10.1016/j.neucom.2005.05.013", "journal": "Neurocomputing"}
+{"title": "Robust Phase-Correlation Based Registration of Airborne Videos Using Motion Estimation", "sha": "9c25585364c4d04fb57c92d112ce3b0818c328b4", "authors": ["Frank De Morsier", "Maurice Borgeaud", "Christoph K\u00fcchler", "Adrian Vogel", "Volker Gass", "Jean-Philippe Thiran"], "doi": "10.1007/978-3-642-32714-8_3", "journal": "Lecture Notes in Geoinformation and Cartography"}
+{"title": "CLOTTING ACTIVATION AND IMPAIRMENT OF FIBRINOLYSIS IN MALIGNANCY", "sha": "9c6af76859e0b1866f8fd6f68da1db6ca4ac78ef", "authors": ["E Rocha", "J Fernandez", "F Cuesta", "B Hernandez", "M", "Paloma", "M-J", "J"], "doi": null, "journal": "THROMBOSIS RESEARCH"}
+{"title": "Optically continuous silcrete quartz cements of the St. Peter Sandstone: High precision oxygen isotope analysis by ion microprobe", "sha": "9daa0206e43359a1cc4957e603fe4c1bd23f0acd", "authors": ["Jacque Kelly", "Bin Fu", "Noriko Kita", "John Valley"], "doi": "10.1016/j.gca.2007.05.014", "journal": "Geochimica et Cosmochimica Acta"}
+{"title": "Epidemiology and Prevention Prognostic Values of Clockwise and Counterclockwise Rotation for Cardiovascular Mortality in Japanese Subjects A 24-Year Follow-Up of the National Integrated Project for Prospective Observation of Noncommunicable Disease and Its Trends in the Aged, 1980-2004 (NIPPON DATA80)", "sha": "9dbab3f1a53a6766c0702e7c46decba8c3fbe8b4", "authors": ["Yasuyuki Nakamura", "Tomonori Okamura", "Aya Higashiyama", "Makoto Watanabe", "Aya Kadota", "Takayoshi Ohkubo", "Katsuyuki Miura", "Fumiyoshi Kasagi", "Kazunori Kodama", "Akira Okayama", "Hirotsugu Ueshima", ";"], "doi": null, "journal": null}
+{"title": "Neural correlates of behavior in the moth Manduca sexta in response to complex odors", "sha": "9e2ede65a1e0bf32c6fc9470d53fcc12a1b67bae", "authors": ["Jeffrey Riffell", "H Lei", "John Hildebrand"], "doi": null, "journal": null}
+{"title": "Environmental Chemistry Education in Europe: Setting the Agenda 1st Workshop organised by the Committee on Education in Environmental Chemistry", "sha": "9e61373b9a790999d1128c71e0c2350c27a97f1f", "authors": ["Uri Zoller"], "doi": null, "journal": null}
+{"title": "Dental Press Ortodon Ortop Facial 94 Maring\u00e1", "sha": "9e68fd7f7a666d02c40b13a2546c16f62cec18b6", "authors": ["Elionai Dias Soares", "Adriana Silva De Carvalho", "Jurandir Ant\u00f4nio Barbosa"], "doi": null, "journal": null}
+{"title": "Experimental testing and modelling of an industrial insulated pipeline for deep sea application", "sha": "9eb48fd750b113b0442e771a6f34ad79d5558353", "authors": ["Nad\u00e8ge Bouchonneau", "Val\u00e9rie Sauvant-Moynot", "Dominique Choqueuse", "Fran\u00e7ois Grosjean", "Emmanuel Poncet", "Dominique Perreux"], "doi": "10.1016/j.petrol.2010.03.023", "journal": "Journal of Petroleum Science and Engineering"}
+{"title": "Comparing Weighting Models for Monolingual Information Retrieval", "sha": "9ed309f4aec835264863d1ce9746fc7edf2c14d1", "authors": ["Gianni Amati", "Claudio Carpineto", "Giovanni Romano"], "doi": null, "journal": null}
+{"title": "Web Mining in Soft Computing Framework: Relevance, State of the Art and Future Directions", "sha": "9edaf8342f509de3013964b58fdecb6a3163db8a", "authors": ["Sankar Pal", "Pabitra Mitra"], "doi": null, "journal": "IEEE TRANSACTIONS ON NEURAL NETWORKS"}
+{"title": "Balance and gait in older electroconvulsive therapy recipients: a pilot study", "sha": "9f46700eea89690b68bf948b4eeb800c494ec39e", "authors": ["Chris Plakiotis", "Barson", "Vengadasalam", "Haines", "O&apos;connor"], "doi": "10.2147/ndt.s42628", "journal": "Neuropsychiatric Disease and Treatment"}
+{"title": "A Performance Study of Deployment Factors in Wireless Mesh Networks", "sha": "9f50c9285a1f484bf724964ed38c15712d02d4f1", "authors": ["Joshua Robinson", "Edward Knightly"], "doi": null, "journal": null}
+{"title": "QUO VADIS, BAYESIAN IDENTIFICATION?", "sha": "9f811b1265bc77fe113db3d11f8c8b4611c0c68b", "authors": ["Rudolf Kulhav\u00fd", "Petya Ivanova"], "doi": null, "journal": null}
+{"title": "Introducing a socio-technical perspective on business processes into Enterprise Interoperability Frameworks", "sha": "9f84b71a8bcacf2d57e61f78688c34c8137028bd", "authors": ["Charles Crick", "Eng Chew"], "doi": null, "journal": null}
+{"title": "Isomorphous Substitution in Bimetallic Oxide Clusters", "sha": "9f92901419db2ee4870590c2d14c2521e65ac878", "authors": ["E Janssens", "G Santambrogio", "M Br\u00fcmmer", "L W\u00f6ste", "P Lievens", "J Sauer", "G Meijer", "K Asmis"], "doi": "10.1103/physrevlett.96.233401", "journal": "Physical Review Letters"}
+{"title": "Bradford Scholars-how to deposit your paper Overview Copyright check", "sha": "9fe650ba76b24ef259a560ee4c9ced0cceaa5cc6", "authors": [], "doi": null, "journal": null}
+{"title": "Special Section of Neuropsychology Review on HIV/NeuroAIDS", "sha": "a00de21eb383e61bfa93613f6e88ff90704c80e0", "authors": ["Edith Sullivan"], "doi": "10.1007/s11065-009-9104-3", "journal": "Neuropsychology Review"}
+{"title": "Extraction of Antioxidants from Borage (Borago officinalis L.) Leaves\u2014Optimization by Response Surface Method and Application in Oil-in-Water Emulsions", "sha": "a03179f585ea92d01e04443c73edce7300d65456", "authors": ["Francisco Segovia", "Bryshila Lupo", "Sara Peir\u00f3", "Michael Gordon", "Mar\u00eda Almajano"], "doi": "10.3390/antiox3020339", "journal": "Antioxidants"}
+{"title": "A Service of zbw Leibniz-Informationszentrum Wirtschaft Leibniz Information Centre for Economics", "sha": "a03c7548abd22d4ba93260e4c2bfe297fa0488fe", "authors": ["Axel Dreher", "Pierre-Guillaume M\u00e9on", "Friedrich Schneider"], "doi": "10.3929/ethz-a-005503049", "journal": null}
+{"title": "\u00dd\u00d7\u00d7\u00d7\u00d2 \u00d2\u00d2\u00d0\u00dd\u00d7\u00d7\u00d7 \u00d3\u00d3 \u00cb\u00db\u00db\u00d8\u00d8\u00d2\u00d2 \u00ca\u00c0 \u00c5\u00d3\u00d3\u00d3\u00d0\u00d7 \u00cb\u00dd\u00d0\u00da\u00da\u00da \u00c3\u00c3\u00d9\u00d9\u00d1\u00d1\u00d2\u00d2 \u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd \u00d3\u00d3 \u00ce\u00ce\u00ce\u00d2\u00d2\u00d2\u00b8\u00b8\u00b8\u00d4\u00d4\u00d6\u00d8\u00d1\u00d1\u00d2\u00d8\u00ce\u00ce\u00ce\u00d2\u00d2\u00d2\u00b8\u00b8\u00b8\u00d4\u00d4\u00d6\u00d8\u00d1\u00d1\u00d2\u00d8 \u00d3\u00d3 \u00d3\u00d3\u00d3\u00d2\u00d3\u00d1\u00d1\u00d1\u00d7 \u00c0\u00d3\u00d3\u00d3\u00d2\u00d7\u00d8\u00d8\u00d9\u00d9\u00d9\u00d2\u00d2\u00d2\u00d7\u00d7\u00d7\u00b8\u00bd\u00bc\u00bd\u00bc\u00c0\u00d3\u00d3\u00d3\u00d2\u00d7\u00d8\u00d8\u00d9\u00d9\u00d9\u00d2\u00d2\u00d2\u00d7\u00d7\u00d7\u00b8\u00c0\u00d3\u00d3\u00d3\u00d2\u00d7\u00d8\u00d8\u00d9\u00d9\u00d9\u00d2\u00d2\u00d2\u00d7\u00d7\u00d7\u00b8\u00bd\u00bc\u00bd\u00bc \u00ce\u00ce\u00ce\u00d2\u00d2\u00d2 \u00d9\u00d7\u00d8\u00d6\u00d6\u00d6 \u00b4\u00b4\u00d1\u00d1\u00d1\u00d0\u00d0 \u00d7\u00dd\u00d0\u00da\u00da\u00da\u00ba\u00ba\u00d9\u00d9\u00d1\u00d1\u00d2\u00d2\u00d2\u00d9\u00d2\u00d2\u00da\u00da\u00da\u00ba\u00ba\u00ba\u00ba\u00ba\u00d8\u00b5 \u00d2\u00d2 \u00cb\u00dd\u00d0\u00da\u00da\u00da \u00d6\u00d6\u00d6\u00db\u00db\u00d6\u00d8\u00d8\u00b9\u00cb\u00cb\u00d2\u00d2\u00d8\u00d8\u00d8\u00d6 \u00bd \u00ce\u00ce\u00ce\u00d2\u00d2\u00d2 \u00cd\u00d2\u00d2\u00da\u00d6\u00d7\u00d7\u00d8\u00dd \u00d3\u00d3 \u00d3\u00d3\u00d3\u00d2\u00d3\u00d1\u00d1\u00d1\u00d7 \u00d7\u00d2\u00d2 \u00d9\u00d7\u00d7\u00d2\u00d2\u00d7\u00d7 \u00d1\u00d1\u00d2\u00d2\u00d7\u00d8\u00d6\u00d6\u00d8\u00d8\u00d3\u00d2 \u00d4\u00d4\u00d6\u00d8\u00d1\u00d1\u00d2\u00d8 \u00d3\u00d3 \u00cb\u00d8\u00d8\u00d8\u00d8\u00d7\u00d8\u00d8\u00d8\u00d7 \u00d9\u00d9\u00d9\u00d7\u00d7\u00d7 \u00be\u00b9\u00b9\u00b8\u00bd\u00bc\u00bc\u00bc\u00be\u00b9\u00b9\u00b8\u00bd\u00bc\u00bc\u00bc 
\u00ce\u00ce\u00ce\u00d2\u00d2\u00d2 \u00d9\u00d7\u00d8\u00d6\u00d6\u00d6 \u00b4\u00b4\u00d1\u00d1\u00d1\u00d0\u00d0 \u00d7\u00d7\u00d6\u00d9\u00d9\u00d9\u00db\u00db\u00db\u00db\u00d7\u00d7\u00d7\u00ba\u00db\u00d9\u00b9\u00db\u00db\u00db\u00d2\u00ba\u00ba\u00ba\u00ba\u00ba\u00d8\u00b5 \u00bd \u00d3\u00d6\u00d6\u00d6\u00d7\u00d4\u00d3\u00d2\u00d2\u00d2\u00d2\u00d2 \u00d2\u00d9\u00d8\u00d8\u00d3\u00d6 \u00bd", "sha": "a055651331084dd2688846279a1075a12ef39f79", "authors": [], "doi": null, "journal": null}
+{"title": "Orthodontics in 3 millennia. Chapter 3: The professionalization of orthodontics", "sha": "a05cc1d6546f5db7b93623f3325262f90bc7a534", "authors": ["Norman Sequim", "Wash"], "doi": "10.1016/j.ajodo.2005.04.001", "journal": "American Journal of Orthodontics and Dentofacial Orthopedics"}
+{"title": "Primary leiomyoma of the liver: accurate preoperative diagnosis on liver biopsy", "sha": "a0e67e8f9052b62091f1e53cb63d777be3215d18", "authors": ["H Sousa", "F Portela", "L Semedo", "E Furtado", "C Marinho", "M Cipriano", "M Leitao"], "doi": "10.1136/bcr.09.2008.0898", "journal": "Case Reports"}
+{"title": "Using GOMS for User Interface Design and Evaluation: Which Technique?", "sha": "a0f4aaabb8c159b11b98418bb704d612cb601d5a", "authors": ["Bonnie John", "David Kieras"], "doi": null, "journal": null}
+{"title": "Optimally Maximizing Iteration-Level Loop Parallelism", "sha": "a1964dbe96c70a3c7b47a508eea4bd8121df8915", "authors": ["Duo Liu", "Yi Wang", "Zili Shao", "Minyi Guo", "Jingling Xue"], "doi": "10.1109/tpds.2011.171", "journal": "IEEE Transactions on Parallel and Distributed Systems"}
+{"title": "Correlates of Self-Rated Successful Aging Among Community-Dwelling Older Adults", "sha": "a1f3e091595770fb47b610bb58fd4ada90434431", "authors": ["Lori Montross", "Colin Depp", "John Daly", "Jennifer Reichstadt", "Shahrokh Golshan", "David Moore", "David Sitzer", "V Jeste"], "doi": null, "journal": null}
+{"title": "Effect of patient positioning on the duration of venous reflux in duplex ultrasound for venous insufficiency", "sha": "a1fb3878079ccb07e6b5ce8503e5dbe12fa6fe97", "authors": ["M Bonfield", "F Cramp", "T Robinson"], "doi": "10.1258/ult.2012.011055", "journal": "Ultrasound"}
+{"title": "Evaluating the Failures of Data Centers in Cloud Computing", "sha": "a23c64ab75e81523f92d417c8d2cab7996d7a3a1", "authors": ["Preeti Gupta", "Chaahat Gupta"], "doi": null, "journal": "International Journal of Computer Applications"}
+{"title": "Indiana Libraries", "sha": "a2636ad5f98a083b80a430e069ec7280def50e04", "authors": [], "doi": null, "journal": null}
+{"title": "Interferometers for Displacement-Noise-Free Gravitational-Wave Detection", "sha": "a3603cc9e5824f8351c91756188905695cd8753e", "authors": ["Yanbei Chen", "Archana Pai", "Kentaro Somiya", "Seiji Kawamura", "Shuichi Sato", "Keiko Kokeyama", "Robert Ward", "Keisuke Goda", "Eugeniy Mikhailov"], "doi": "10.1103/physrevlett.97.151103", "journal": "Physical Review Letters"}
+{"title": "OCIS codes: 160.3918, 160.4670, 260.5740", "sha": "a3608f1c7182c52a1850cbed8f567c582dfdd9c0", "authors": ["Kamil Alici", "Ekmel Ozbay", "K Alici", "E Ozbay"], "doi": null, "journal": "J. Opt. Soc. Am. B"}
+{"title": "Stable Policies for Petri-Nets with Fluctuating Transition Processes", "sha": "a399346ed95116a34d2c894235a2e6df10fcd1d3", "authors": ["Costas Courcoubetis", "Richard Weber"], "doi": null, "journal": "Proceedings of the 29th Conference on Decision and Conlrol Honolulu"}
+{"title": "Holdup, Search and Inefficiency Shingo Ishiguro", "sha": "a3d2c453f39527891470ae3b9caf246a90b90e62", "authors": [], "doi": null, "journal": null}
+{"title": "Does Benford's law hold in economic research and forecasting?", "sha": "a3d468938790944bd9b4a8cfbe102ce4b972cfa7", "authors": ["Stefan G\u00fcnnel", "Karl-Heinz T\u00f6dter"], "doi": null, "journal": null}
+{"title": "Atomic Layer Deposition on Biological Macromolecules: Metal Oxide Coating of Tobacco Mosaic Virus and Ferritin", "sha": "a44188e8c7ea7f5506ed5d2ad686cc88f07fc898", "authors": ["Mato Knez", "Anan Kadri", "Christina Wege", "Ulrich G\u00f6sele", "Holger Jeske", "Kornelius Nielsch"], "doi": "10.1021/nl060413j", "journal": "Nano Letters"}
+{"title": "Introduction: Technology Partnerships: Marriage of Convenience or Full-Fledged Collabo-ration?", "sha": "a460afdd70a5a3d1df7625644a14928759281979", "authors": ["Sara Laughlin", "Sam Laughlin", "Associates"], "doi": null, "journal": null}
+{"title": "Design and implementation of a mobile database for Java phones", "sha": "a47b6b2f94e27321a60d5216b50002a6a86a0279", "authors": ["Eric Jui-Lin Lu", "Yung-Yuan Cheng"], "doi": "10.1016/j.csi.2003.12.003", "journal": "Computer Standards & Interfaces"}
+{"title": "Numerische Mathematik Symplectic phase flow approximation for the numerical integration of canonical systems", "sha": "a524d479ef5f45bba366c6e56bd8233ab81cc6f1", "authors": ["S Miesbach", "H Pesch"], "doi": null, "journal": "Numer. Math"}
+{"title": "FPGA Implementable Architecture for Geometric Global Positioning", "sha": "a578193e7aa33fdabf221061c68025844bde0599", "authors": ["Anant Utgikar", "Guna Seetharaman"], "doi": null, "journal": null}
+{"title": "Fractal: A mobile code-based framework for dynamic application protocol adaptation", "sha": "a5bee2b4afb483518f392f2671bb95142ebeb8ff", "authors": ["H Lufei", "W Shi"], "doi": "10.1016/j.jpdc.2006.03.004", "journal": "Journal of Parallel and Distributed Computing"}
+{"title": "Time-Varying Harmonics: Part II-Harmonic Summation and Propagation", "sha": "a5f5c65cd8f066efb06ddd8cb6876258dfb05464", "authors": ["Y Baghzouz", "R Burch", "A Capasso", "A Cavallini", "A Emanuel", "M Halpin", "R Langella", "G Montanari", "K Olejniczak", "P Ribeiro", "S Rios-Marcuello", "F Ruggiero", "R Thallam", "A Testa", "P Verde"], "doi": null, "journal": "IEEE TRANSACTIONS ON POWER SYSTEMS"}
+{"title": "Reconstructing Optical Flow Fields by Motion Inpainting", "sha": "a621af9501677022a01965396fc0b21f63c93f7a", "authors": ["Benjamin Berkels", "Claudia Kondermann", "Christoph Garbe", "Martin Rumpf"], "doi": null, "journal": null}
+{"title": "Hugh LaFollette: The Practice of Ethics", "sha": "a65aa4239ed749f4fca3f629c2c001e792a66514", "authors": ["Alex Voorhoeve"], "doi": "10.1007/s00355-009-0414-4", "journal": "Social Choice and Welfare"}
+{"title": "The expressivist objection to prenatal diagnosis: can it be laid to rest?", "sha": "a6b6f9f213695d4ed741c2aa4b2b3de4d88353be", "authors": ["S Holm", "S\u00f8ren Holm"], "doi": "10.1136/jme.2006.019984", "journal": "Journal of Medical Ethics"}
+{"title": "Interactive Super-resolution through Neighbor Embedding", "sha": "a6d5812b8e9603a524a7af90312bc633cbf47a41", "authors": ["Jian Pu", "Junping Zhang", "Peihong Guo", "Xiaoru Yuan"], "doi": null, "journal": null}
+{"title": "Quantum-classical modeling of photoisomerization of polyatomic molecules A model Hamiltonian to simulate the complex photochemistry of benzene II Molecular dynamics simulation with an ab initio potential energy function and finite element interpolation: The photoisomerization of cis-stilbene in solution Quantum-classical modeling of photoisomerization of polyatomic molecules", "sha": "a7071dee344aa04a8dab6cb5112333a7356beaca", "authors": ["D Tranca", "A Neufeld", "D Tranca", "A Neufeld"], "doi": null, "journal": "Citation: The Journal of Chemical Physics"}
+{"title": "Interactive effects of proactive personality and display rules on emotional labor in organizations", "sha": "a72df616d77469f0c13ee738622ddf71ed77902e", "authors": ["Kristen Randolph", "Jason Dahling"], "doi": "10.1111/jasp.12184", "journal": "Journal of Applied Social Psychology"}
+{"title": "Structural modeling of contrast sensitivity in adulthood", "sha": "a75a17f2539deb121f554dcc5bddea499451ae05", "authors": ["Charles Scialfa", "Donald Kline", "Philip Wood"], "doi": null, "journal": null}
+{"title": "Continuing Enginecring Education Program", "sha": "a7968e86589676188d45ddac1bf84b96e8dd2b78", "authors": ["Cambridge England", "J Davies", "R Graglia", "R Mittra", ") Wilton", "Dayics"], "doi": null, "journal": "ADVANCED DEVELOPMENTS IN RADAR 1-7"}
+{"title": "OBSERVATIONS U-Shaped Association Between White Blood Cell Count and Fasting Plasma Glucose Level", "sha": "a83d363dea48ba9242d40ab1fe9ad0a82a55d41c", "authors": ["Koji Tamakoshi", "Hiroshi Yatsuya", "Takaaki Kondo", "Yoko Hori", "Huiming Zhang", "Miyuki Ishikawa", "Chiyoe Murata", "Rei Otsuka", "Shankuan Zhu", "Hideaki Toyoshima", "Luke&apos;", "/ Roosevelt"], "doi": null, "journal": null}
+{"title": "Magnetoelectric effects in single crystals of the cubic ferrimagnetic helimagnet Cu 2 OSeO 3", "sha": "a85536b544ab227371da3151c853b3bad13ec017", "authors": ["M Belesi", "I Rousochatzakis", "M Abid", "U R\u00f6\u00dfler", "H Berger", "J.-Ph Ansermet"], "doi": null, "journal": null}
+{"title": "Liquid biopsies in patients with diffuse glioma", "sha": "a89b878ffdd46b8fff31f379f6ceab6f8164dc36", "authors": ["Citation Best", "G Myron", "Nik Sol", "Sebastiaan Zijl", "Jaap Reijneveld", "Pieter Wesseling", "Thomas Wurdinger"], "doi": null, "journal": "Acta Neuropathologica"}
+{"title": "Degradation of 2,4-dichlorophenol and coupling into humic matter by oxidative biomimetic catalysis with iron-porphyrin", "sha": "a8a47f493c5b7bacda6f4eb814e7158688e5bf41", "authors": ["Barbara Fontaine", "Assunta Nuzzo", "Riccardo Spaccini", "Alessandro Piccolo"], "doi": "10.1016/j.gexplo.2012.11.003", "journal": "Journal of Geochemical Exploration"}
+{"title": "OBSERVATIONS Remitting Diabetes A new genetic subgroup?", "sha": "a9fa613b32d5fa892f19d529e31a649bba9153cf", "authors": [], "doi": null, "journal": null}
+{"title": "Distributed Flames in Type Ia Supernovae", "sha": "aa403b2887b0ef400f382bd3a5ea672b4aee5e8c", "authors": ["A Aspden", "J Bell", "S Woosley"], "doi": null, "journal": null}
+{"title": "Chromosomal bar codes produced by multicolor fluorescence in situ hybridization with multiple YAC clones and whole chromosome painting probes", "sha": "ab022189079f98937960858c34c9ae9a34de5786", "authors": ["Christoph Lengauer", "Michael Speicher", "Susanne Popp", "Anna Jauch", "Masafumi Taniwaki", "Ramaiah Nagaraja", "Harold Rietnman", "Helen Donis-Keller", "Michele D&apos;urso", "David Schlessinger", "Thomas Cremer"], "doi": null, "journal": "Human Molecular Genetics"}
+{"title": "NBER WORKING PAPER SERIES DO STUDENTS CARE ABOUT SCHOOL QUALITY? DETERMINANTS OF DROPOUT BEHAVIOR IN DEVELOPING COUNTRIES", "sha": "ab3d4956579e57df1714894243c70210fbbaa833", "authors": ["Eric Hanushek", "Victor Lavy", "Kohtaro Hitomi", "Joshua Angrist", "Mark Bils", "Bruce Chapman", "Paul Chen", "Mark Harrison", "Elizabeth King", "Emmanuel Jimenez", "Michele Tertilt", "Martin Zelder"], "doi": null, "journal": null}
+{"title": "Implementation of the Local Reference Node Concept for Spatially Distributed Circuits", "sha": "ab5217a9b9813972098b3928b5e3ed49d2118117", "authors": ["Carlos Christoffersen", "Michael Steer"], "doi": null, "journal": null}
+{"title": "Aggregation of Frenkel defects under irradiation: a mesoscopic approach", "sha": "abe1b49d7f715627d3f7c422255991a2273c64eb", "authors": ["W Soppe", "E Kotornin"], "doi": null, "journal": "NNhl B Nuclear Instruments and Methods in Physics Research B"}
+{"title": "Breakdown of Fermi liquid behavior at the(\u03c0,\u03c0)=2kFspin-density wave quantum-critical point: The case of electron-doped cuprates", "sha": "ac4cf4f5412b3a61b590f6643acbdd3b901c8b47", "authors": ["Dominic Bergeron", "Debanjan Chowdhury", "Matthias Punk", "Subir Sachdev", "A Tremblay"], "doi": "10.1103/physrevb.86.155123", "journal": "Physical Review B"}
+{"title": "Timing of Prenatal Stressors and Autism", "sha": "ac59503663e07f6a1747647d7dd78d56d40c4e9c", "authors": ["D Beversdorf", "S Manning", "A Hillier", "S Anderson", "R Nordgren", "S Walters", "H Nagaraja", "W Cooley", "S Gaelic", "M Bauman"], "doi": "10.1007/s10803-005-5037-8", "journal": "Journal of Autism and Developmental Disorders"}
+{"title": "Do the elderly reduce housing equity? An international comparison", "sha": "ac9b769ae96fd7e7995ca3e93f92ca1f52a2df0e", "authors": ["Maria Chiuri", "Tullio Jappelli"], "doi": null, "journal": null}
+{"title": "Design of Energy-Saving Algorithms for Hybrid Fiber Coaxial Networks Based on the DOCSIS 30 Standard", "sha": "acac3aae8cba2743b2dc6e1708a2a98daa96eea3", "authors": ["Zuqing Zhu"], "doi": "10.1364/jocn.4.000449", "journal": "Journal of Optical Communications and Networking"}
+{"title": "Short-term degradation behaviors of light emitting diodes made of polyurethane derivative with large permanent dipoles on the side chain", "sha": "acaef51369c7f468f4e600b0661f5e4b7b7a03f7", "authors": ["Hyein Jeong", "Dechun Zou", "Tetsuo Tsutsui", "Chang-Sik Ha"], "doi": null, "journal": null}
+{"title": "Integrated Capacitors for Conductive Lithographic Film Circuits", "sha": "acf4eeed48a21ef7f3947317d7109ccff8fdb5e5", "authors": ["Paul Harrey", "Peter Evans", "David Harrison"], "doi": null, "journal": "IEEE TRANSACTIONS ON ELECTRONICS PACKAGING MANUFACTURING"}
+{"title": "Relato de experi\u00eancia-369-Texto Contexto Enferm", "sha": "acffcd37ed4aec736bbcc2ec15ed85be33a6dbad", "authors": ["Alacoque Erdmann", "Marli Terezinha", "Stein Backes", "Dirce Stein Backes", "Magda Koerich", "Maria Baggio", "Jacira Carvalho", "Betina H\u00f6rner", "Schlindwein Meirelles"], "doi": null, "journal": null}
+{"title": "Maximum Unfolded Embedding: Formulation, Solution, and Application for Image Clustering", "sha": "ad0cb9c59d923f08848fc34e42b9a1be3002e952", "authors": ["Huan Wang", "Shuicheng Yan", "Thomas Huang", "Xiaoou Tang"], "doi": null, "journal": null}
+{"title": "Reliability and bifurcation in neurons driven by multiple sinusoids", "sha": "ad0d26a13361c8481d8ae842b8d71bd8051dd95f", "authors": ["Peter Thomas", "Paul Tiesinga", "Jean-Marc Fellous", "Terrence Sejnowski"], "doi": "10.1016/s0925-2312(02)00856-1", "journal": "Neurocomputing"}
+{"title": "Program Chairs' Introduction to the 2004 IEEE Workshop on Real-Time Vision for Human-Computer Interaction at the 2004 IEEE CVPR Conference", "sha": "ad3e2fe6747bee580ae4c71af07ebc975c3cd041", "authors": ["Washington", "D Branislav", "Kisa\u010danin Vladimir", "Pavlovi\u00b4c Pavlovi\u00b4c", "Thomas Huang", "Advanced Engineering"], "doi": null, "journal": null}
+{"title": "Applications of Staffing, Scheduling, Budgeting Methodologies to Hospital Ancillary Units", "sha": "ada7455f31f400af53a2ff3e7b43a34f118b6030", "authors": ["Myeng-Ki Kirn", "Walton Hancock"], "doi": null, "journal": "Journal of Medical Systems"}
+{"title": "Does Political Instability lead to higher and more volatile inflation? A Panel Data Analysis", "sha": "adf0463c03c4ed0202f8d2f2aedeaaa3f966faab", "authors": ["Ari Aisen", "Francisco Veiga"], "doi": null, "journal": null}
+{"title": "Sense of coherence as a mediator of health-related quality of life dimensions in patients with breast cancer: a longitudinal study with prospective design", "sha": "ae0fbfda838057082daa881f607dea76e8101b11", "authors": ["Camelia Rohani", "Heidar-Ali Abedi", "Kay Sundberg", "Ann Langius-Ekl\u00f6f"], "doi": "10.1186/s12955-015-0392-4", "journal": "Health and Quality of Life Outcomes"}
+{"title": "Did Trade Liberalization Help Women? The Case of Mexico in the 1990s", "sha": "ae92c936a730d1915d9c254c5c1bc4c1c690d53e", "authors": ["Ernesto Aguayo-Tellez", "Jim Airola", "Chinhui Juhn", "Carolina Villegas-Sanchez", "Chinhui Juhn"], "doi": null, "journal": null}
+{"title": "Aloca\u00e7\u00e3o de modelos de produtos a equipes de trabalhadores baseada em modelos de curvas de aprendizagem Assignment of product models to worker teams using learning curve models", "sha": "af09e20b898ee6b048d427ffa41a9ae66986ff11", "authors": [], "doi": null, "journal": null}
+{"title": "Redox cycling in nanoporous electrochemical devices", "sha": "af3c9251d4a51f90d3c7fa36ea87d7f9104a441d", "authors": ["Martin H\u00fcske", "Regina Stockmann", "Andreas Offenh\u00e4usser", "Bernhard Wolfrum"], "doi": "10.1039/c3nr03818a", "journal": "Nanoscale"}
+{"title": "Optimisation of Relative Quantitative RT-PCR for Expression Analysis in Azalea Flower Colour Sports", "sha": "af489ab8ee306a093e928ff68ac38018a598ff74", "authors": ["E De Keyser", "J De Riek", "E Van Bockstaele"], "doi": null, "journal": null}
+{"title": "Bio-butanol production from glycerol with Clostridium pasteurianum CH4: the effects of butyrate addition and in situ butanol removal via membrane distillation", "sha": "af57a39fb1edcb774757d4abaadb6fe2e5c2e966", "authors": ["De-Shun Lin", "Hong-Wei Yen", "Wei-Chen Kao", "Chieh-Lun Cheng", "Wen-Ming Chen", "Chieh-Chen Huang", "Jo-Shu Chang"], "doi": "10.1186/s13068-015-0352-6", "journal": "Biotechnology for Biofuels"}
+{"title": "Protein structure and dynamics at high pressure 1", "sha": "afe349731681d0956ea31a5b4eb4a6efc77afdd2", "authors": ["K Heremans", "L Smeller"], "doi": null, "journal": null}
+{"title": "Seasonal and Geographic Patterns in Tanning Using Real-Time Data From Google Trends", "sha": "b026f1a599b6c5b3d7c5958cb0377ea59e207545", "authors": ["Ashley Day", "Jerod Stapleton", "Sharon Manne", "Kristina Tatum", "James Goydos", "Elliot Coups"], "doi": null, "journal": "J Med Internet Res"}
+{"title": "CLARKE CRITICAL VALUES OF SUBANALYTIC LIPSCHITZ CONTINUOUS FUNCTIONS", "sha": "b038f267e7fd92dc2bdb34b31c469530e90e5ab7", "authors": ["J Er\u02c6omeer\u02c6", "Er\u02c6ome Bolte", "Aris Daniilidis", "Adrian Lewis", "Masahiro Shiota"], "doi": null, "journal": null}
+{"title": "Shanghai rising: health improvements as measured by avoidable mortality since 2000", "sha": "b0a3aaa36bb1b1b7f33122d448966f71a3737021", "authors": ["Michael Gusmano", "Victor Rodwin", "Chunfang Wang", "Daniel Weisz", "Li Luo", "Fu Hua"], "doi": "10.15171/ijhpm.2015.07", "journal": "International Journal of Health Policy and Management"}
+{"title": "Pricing climate risk mitigation", "sha": "b0c124b5edc9d6f009bfe0d7544528ca29943670", "authors": ["Joseph Aldy"], "doi": "10.1038/nclimate2540", "journal": "Nature Climate Change"}
+{"title": "Title Organic light-emitting devices", "sha": "b126ff8522576dea61acd8ad80f68a543e0e41c8", "authors": [], "doi": null, "journal": null}
+{"title": "RECURRENT CRITICAL POINTS AND TYPICAL LIMIT SETS OF RATIONAL MAPS", "sha": "b132253a7e380237879a9437d9f40dcb9a12b338", "authors": ["Alexander Blokh", "John Mayer", "Lex Oversteegen"], "doi": null, "journal": "PROCEEDINGS OF THE AMERICAN MATHEMATICAL SOCIETY"}
+{"title": "Cytosolic tail sequences and subunit interactions are critical for synaptic localization of glutamate receptors", "sha": "b16a745154dc249c421345eebc329f82d4168af8", "authors": ["Null- Chang"], "doi": "10.1242/jcs.02320", "journal": "Journal of Cell Science"}
+{"title": "Neuroendocrine mechanisms controlling female puberty: new approaches, new concepts", "sha": "b1f689952cb2f4d02a6a5dbdefd7180533333e67", "authors": ["Sergio Ojeda", "Christian Roth", "Alison Mungenast", "Sabine Heger", "Claudio Mastronardi", "Anne-Simone Parent", "Alejandro Lomniczi", "Heike Jung"], "doi": "10.1111/j.1365-2605.2005.00619.x", "journal": "International Journal of Andrology"}
+{"title": "Atmospheric Precorrected Differential Absorption Technique to Retrieve Columnar Water Vapor", "sha": "b205ef2e6ac9dd5b730c3249bfd867703268cd1e", "authors": ["Daniel Schl\u00e4", "Christoph Borel", "Johannes Keller", "Klaus Itten"], "doi": null, "journal": null}
+{"title": "Closing the circadian negative feedback loop: FRQ-dependent clearance of WC-1 from the nucleus Supplementary Material", "sha": "b268b2ffaebf154b6b64fb98d68ea246cb292eb9", "authors": ["Christian Hong", "Peter Ruoff", "Jennifer Loros", "Jay Dunlap"], "doi": null, "journal": null}
+{"title": "Manipulation of DNA Origami Nanotubes in Liquid using a Programmable Tapping Mode AFM", "sha": "b27b7df220089b2f9666025071127409b4149c03", "authors": ["Longhai Li", "Xiaojun Tian", "Zaili Dong", "Lianqing Liu", "Osamu Tabata", "Wen Li"], "doi": null, "journal": null}
+{"title": "Construction of a Semantic Model for a Typed Assembly Language", "sha": "b287f3d8b7bb272962d6771c227fc690cf28402a", "authors": ["Gang Tan", "Andrew Appel", "Kedar Swadi", "Dinghao Wu"], "doi": null, "journal": null}
+{"title": "A BIT OF MATHEMATICAL HISTORY. A BIT OF MATHEMATICAL HISTOEY", "sha": "b2a2e2dc16de9c139f59c7e724959ddb60d77a54", "authors": ["B Db", "B\u00f4chbr Maxime"], "doi": null, "journal": null}
+{"title": "Limbic system mechanisms of stress regulation: Hypothalamo-pituitary-adrenocortical axis", "sha": "b2b806e5639172511d407d179eb2323aa7ad2de0", "authors": ["James Herman", "Michelle Ostrander", "Nancy Mueller", "Helmer Figueiredo"], "doi": "10.1016/j.pnpbp.2005.08.006", "journal": "Progress in Neuro-Psychopharmacology and Biological Psychiatry"}
+{"title": "Preface to the EURO Journal on Decision Processes", "sha": "b32a8e20bdaba0675c678fdb5be3ca8a28d5c4da", "authors": ["Ahti Salo", "Marja Makarow"], "doi": "10.1007/s40070-013-0009-2", "journal": "EURO Journal on Decision Processes"}
+{"title": "CONTRIBUI\u00c7\u00c3O AO CONHECIMENTO DA FAUNA DE PEIXES DO LITORAL NORTE DO ESTADO DE S\u00c3O PAULO*", "sha": "b3a73c77f738d84acfd2efc76d604f17a12fcd68", "authors": ["Oceanogr Bolm Inst", "Paulo"], "doi": null, "journal": null}
+{"title": "Incorporation of peptides in phospholipid aggregates using ultrasound", "sha": "b46047fe7ccbf2dcda2f63ece7d342a780cb6845", "authors": ["Raquel Silva", "Collin Little", "Helena Ferreira", "Artur Cavaco-Paulo"], "doi": "10.1016/j.ultsonch.2008.03.010", "journal": "Ultrasonics Sonochemistry"}
+{"title": "Designing a Regulatory and Supervisory Framework for Integrated Financial Markets Designing a regulatory and supervisory framework for integrated financial markets", "sha": "b495000629d9bd73978e7d740eac2450e0b36430", "authors": ["Giorgio Giorgio", "Carmine Noia", "Guido", "Giorgio Giorgio", "Guido Carli", "Viale Pola", "Roma"], "doi": null, "journal": null}
+{"title": "An Empirical Analysis of Forecast Sharing in the Semiconductor Equipment Supply Chain", "sha": "b4ffc1b64f875211656c8211fa3bd409182d06be", "authors": ["Christian Terwiesch", "Z Ren", "Teck Ho", "Morris Cohen"], "doi": null, "journal": null}
+{"title": "Body Odor Quality Predicts Behavioral Attractiveness in Humans", "sha": "b509d2cd399e39bef36349dd6437b54ac0c9d623", "authors": ["S Roberts", "Alexandra Kralevich", "Camille Ferdenzi", "Tamsin Saxton", "Benedict Jones", "Lisa Debruine", "Anthony Little", "Jan Havlicek"], "doi": "10.1007/s10508-011-9803-8", "journal": "Archives of Sexual Behavior"}
+{"title": "DMT-An integrated disaster management tool", "sha": "b52c9e9a0fd0e044e8c8db4f9875e840ed619d60", "authors": ["M Angermann", "M Khider", "M Frassl", "M Lichtenstern"], "doi": null, "journal": null}
+{"title": null, "sha": "b566252f19c82f90196d9e420dc05a52819497f9", "authors": ["Hiroko Hiraoka", "Yojiro Shimada", "Yoshimi Sakata", "Malaika Watanabe", "Kazuhito Itamoto", "Masaru Okuda", "Toshiyuki Masuzawa", "Hisashi Inokuma", "H Inokuma"], "doi": null, "journal": null}
+{"title": "Fuzzy logic speed control of an induction motor", "sha": "b5d4da316000865c5ece666d6504c583c8528bc8", "authors": ["Jaime Fonseca", "Jo\u00e3o Afonso", "J\u00falio Martins", "Carlos Couto"], "doi": null, "journal": null}
+{"title": "Precedent and Progress of an Idea: Quadruple Building Block and the Schindler Shelter", "sha": "b695c68a67b6767dfe04c7d004726860f7134675", "authors": ["Jin-Ho Park", "Jin-Ho Park"], "doi": null, "journal": null}
+{"title": "Absence of Functional Inducible NO Synthase Enhances the Efficacy of Tolerance Induced by High Dose Antigen Feeding", "sha": "b69cab3c484a5dbfb145d13f650b0a9e33f635c9", "authors": ["D Kahn", "D Archer", "C Kelly"], "doi": "10.4049/jimmunol.165.11.6116", "journal": "The Journal of Immunology"}
+{"title": "High-Resolution Electron Microscopic Study on Atomic Arrangements at Growing Tips of Martensite Plates and a Nucleating Martensite in Fe-Ni-Mn and Fe-Cr-C Alloys", "sha": "b76dddd179920b0a64e4b6a2f4c0421f393546ec", "authors": ["* Ogawa", "S Kajiwara"], "doi": "10.2320/matertrans.48.860]", "journal": null}
+{"title": "ORACLE: Mobility control in wireless sensor and actor networks", "sha": "b79d697849372094f8540e22a096f4764fe6f984", "authors": ["Kaoru Ota", "Mianxiong Dong", "Zixue Cheng", "Junbo Wang", "Xu Li", "Xuemin Shen"], "doi": "10.1016/j.comcom.2011.08.008", "journal": "Computer Communications"}
+{"title": "Mechanism for Designing Metallic Metamaterials with a High Index of Refraction", "sha": "b8885187ab73a8c595156e210d945cdf2f409498", "authors": ["J Shen", "Peter Catrysse", "Shanhui Fan"], "doi": "10.1103/physrevlett.94.197401", "journal": "Physical Review Letters"}
+{"title": "Thermal Expansion and Glass Transition Temperatures of Y-Mg-Si-AI-0-N Glasses", "sha": "b8a004e0225186c2c0d236f062b42948f7aa142a", "authors": ["Irene Peterson", "Tseng-Ying Tien"], "doi": null, "journal": null}
+{"title": "Note on semiclassical uncertainty relations", "sha": "b8e6f5e99bf750c97154b3d9205d38212958f144", "authors": ["F Olivares", "F Pennini", "G Ferri", "A Plastino"], "doi": null, "journal": "Brazilian Journal of Physics"}
+{"title": "A Generalized Processor Mapping Technique for Array Redistribution", "sha": "b90415f866733b633ca6447cc116b2ff7edf7056", "authors": ["Ching-Hsien Hsu", "Yeh-Ching Chung", "Don-Lin Yang", "Chyi-Ren Dow"], "doi": null, "journal": null}
+{"title": "COMPLEX GENETICS AND IMPLICATIONS FOR PSYCHIATRY Basic Concepts in the Study of Diseases with Complex Genetics", "sha": "b904dd12215b01742e6a6afdca2f8f00c0432d08", "authors": ["Margit Burmeister"], "doi": null, "journal": null}
+{"title": null, "sha": "b90ee4df3da8b7d6aa6e4365dbeb5288a193664b", "authors": [], "doi": null, "journal": null}
+{"title": "Access to prenatal care: assessment of the adequacy of different indices", "sha": "b9381fa2aeeea9381ef0fae058951fb836f11aca", "authors": ["Edson Santos Neto", "Adauto Oliveira", "Eliana Zandonade", "Maria Leal"], "doi": "10.1590/0102-311x00125612", "journal": "Cadernos de Sa\u00fade P\u00fablica"}
+{"title": null, "sha": "b93e7cb3bcb946f40938837442ec04f333ce5c27", "authors": ["Raymond Garver", "October"], "doi": null, "journal": null}
+{"title": "Precisely Controlled Smart Polymer Scaffold for Nanoscale Manipulation of Biomolecules", "sha": "b98642a157a36a75f928e3a14b896f0dc0d2e3d2", "authors": ["Philipp Spuhler", "Laura Sola", "Xirui Zhang", "Margo Monroe", "Joseph Greenspun", "Marcella Chiari", "M \u00dcnl\u00fc"], "doi": "10.1021/ac3018263", "journal": "Analytical Chemistry"}
+{"title": "AN ANALYSIS OF A P P R O X I M A T I O N S FOR MAXIMIZING S U B M O D U L A R SET F U N C T I O N S-I", "sha": "b9e43395663f74c581982e9ca97a0d7057a0008c", "authors": ["G N E M H A U S E R", "L"], "doi": null, "journal": "Mathematical Programming"}
+{"title": null, "sha": "b9f481f1084cb7b31a5fd3127a3c76c1e22003ac", "authors": [], "doi": null, "journal": null}
+{"title": "KAP1 depletion increases PML nuclear body number in concert with ultrastructural changes in chromatin", "sha": "ba6c9216f35c47eb7e017d9628987fa6f71e8c89", "authors": ["Rosemarie Kepkay", "Kathleen Attwood", "Yael Ziv", "Yosef Shiloh", "Graham Dellaire"], "doi": "10.4161/cc.10.2.14551", "journal": "Cell Cycle"}
+{"title": "Surgical a I Radiolog,c Anatomy Journal of Clinical Anatomy The medial and inferior calcaneal nerves: an anatomic study", "sha": "ba6cc85fe7868538125a8034f617eb3136505648", "authors": ["S Louisia", "A Masquelet"], "doi": null, "journal": "Surg Radiol Anat"}
+{"title": null, "sha": "ba766adacf904f5e7e94fa592558cc3c1f45f6fd", "authors": ["Luiz Carlos De Oliveira Cec\u00edlio"], "doi": null, "journal": null}
+{"title": "Header for SPIE use Heterodyne Instrumentation at the CSO", "sha": "ba795eb58b42e7807e1be993cb3bc02ccdfbb672", "authors": ["Jacob Kooi", "P Schaffer", "Bruce Bumble", "Rick Leduc", "T Phillips"], "doi": null, "journal": null}
+{"title": "Mediterranean Diet and Invasive Breast Cancer Risk Among Women at High Cardiovascular Risk in the PREDIMED Trial", "sha": "bac12831a0eab2dac05efc912083d6819aae2ccc", "authors": ["Estefan\u00eda Toledo", "Jordi Salas-Salvad\u00f3", "Carolina Donat-Vargas", "Pilar Buil-Cosiales", "Ram\u00f3n Estruch", "Emilio Ros", "Dolores Corella", "Montserrat Fit\u00f3", "Frank Hu", "Fernando Ar\u00f3s", "Enrique G\u00f3mez-Gracia", "Dora Romaguera", "Manuel Ortega-Calvo", "Llu\u00eds Serra-Majem", "Xavier Pint\u00f3", "Helmut Schr\u00f6der", "Josep Basora", "Jos\u00e9 Sorl\u00ed", "M\u00f2nica Bull\u00f3", "Merce Serra-Mir", "Miguel Mart\u00ednez-Gonz\u00e1lez"], "doi": "10.1001/jamainternmed.2015.4838", "journal": "JAMA Internal Medicine"}
+{"title": "Real-Time Detection of Phishing Tweets", "sha": "bb56465e9b41be35b0bece076ba2a7661450f5d4", "authors": ["Nilesh Sharma", "Nishant Sharma", "Vishakha Tiwari", "Shweta Chahar", "Smriti Maheshwari"], "doi": "10.5121/csit.2014.4727", "journal": "Computer Science & Information Technology ( CS & IT )"}
+{"title": "Principal Components Regression with Data-Chosen Components and Related Methods", "sha": "bb63ab9110158eb8ae24522a226615d878056776", "authors": ["J Hwang", "Dan Nettleton"], "doi": null, "journal": null}
+{"title": "Seismic ray tracing in anisotropic media: A modified Newton algorithm for solving highly nonlinear systems", "sha": "bbc1ff65d8d74296bcd8d4f2984a9ff42b9a031e", "authors": ["Yanghua Wang"], "doi": "10.1190/geo2013-0110.1", "journal": "GEOPHYSICS"}
+{"title": "Neurotoxic lesions at the ventral mesopontine junction change sleep time and muscle activity during sleep: An animal model of motor disorders in sleep", "sha": "bbdbde1775b8133db0a2ad1afd1f4b89ff7a80fe", "authors": ["Y-Y Lai", "K-C Hsieh", "D Nguyen", "J Peever", "J Siegel"], "doi": "10.1016/j.neuroscience.2008.03.085", "journal": "Neuroscience"}
+{"title": "Image Retrieval and Perceptual Similarity", "sha": "bbf94799250c4b1f4cfcd8349fb46025ff0d2f3b", "authors": ["Dirk Neumann", "Karl Gegenfurtner Justus"], "doi": null, "journal": null}
+{"title": "On Multi-User Gain in MIMO Systems with Rate Constraints", "sha": "bc67b9b966fc6fab3aca2cf096386e15afad28ff", "authors": ["Peng Wang", "Li"], "doi": null, "journal": null}
+{"title": "Auditory dysfunction in Ramsay Hunt syndrome", "sha": "bca40e5ac1cb9c42f462295a831e32c052581168", "authors": ["Vicente Iragui", "Vicente Iragui"], "doi": null, "journal": "Neurosurgery, and Psychiatry"}
+{"title": "Notas sobre a an\u00e1lise antropol\u00f3gica de setores do Estado brasileiro", "sha": "bcb9603cdb8d9412b1ffbb88e00e614839b5ce72", "authors": ["Cim\u00e9a Bevilaqua"], "doi": null, "journal": null}
+{"title": "Letter by Chen Regarding Article, \"The Impact of Green Tea and Coffee Consumption on the Reduced Risk of Stroke Incidence in Japanese Population: The Japan Public Health Center-Based Study Cohort\"", "sha": "bcf0bc7ee026de32b15d8dcd46ce3af5467a2864", "authors": ["R Chen"], "doi": "10.1161/strokeaha.113.001601", "journal": "Stroke"}
+{"title": "Attentional modulation of masked repetition and categorical priming in young and older adults", "sha": "bd00dae009aa118aff8a8f2aacd85cd114431630", "authors": ["Ludovic Fabre", "Patrick Lemaire", "Jonathan Grainger"], "doi": "10.1016/j.cognition.2006.10.011", "journal": "Cognition"}
+{"title": "Animal Models of Kennedy Disease", "sha": "bd2c7a55293ed95cf347c2af9866d9b578264189", "authors": ["Diane Merry"], "doi": null, "journal": null}
+{"title": "Does Education Really Disadvantage Women in the Marriage Market?", "sha": "bde6fa6f143443c23f6be8c36c80adf5f78ad423", "authors": ["Elaina Rose"], "doi": null, "journal": null}
+{"title": "The ecological quality status of the Bay of Seine and the Seine estuary: Use of biotic indices", "sha": "be1d1c75cddbbf75eb92638b24a2773cb40a282b", "authors": ["Jean-Claude Dauvin", "Thierry Ruellet", "Nicolas Desroy", "Anne-Laure Janson"], "doi": "10.1016/j.marpolbul.2006.04.010", "journal": "Marine Pollution Bulletin"}
+{"title": "Title Receiver sensitivity improvement for NRZ-OOK signal by optical parametric amplifier-assisted detection", "sha": "be538ea0b14529001bf2d46a0dc9fd843e85b0d8", "authors": ["Y Liang", "; Chui", "Wong"], "doi": null, "journal": "Citation The Asia Communications and Photonics Conference and Exhibition (ACP 2009)"}
+{"title": "\u201cPatch\u201ding Up Our Tumor Signaling Knowledge", "sha": "be73f2c5deee58b28d82aa5e2b016b6e63b4f78f", "authors": ["Scott Atwood", "Ramon Whitson", "Anthony Oro"], "doi": "10.1038/jid.2012.506", "journal": "Journal of Investigative Dermatology"}
+{"title": "Synthesis and biology of oligoethylene glycol linked naphthoxylosides", "sha": "be7acd921dee2277c792f331f6a25aeff0e4b91b", "authors": ["Karin Holmqvist", "Andrea Persson", "Richard Johnsson", "Johanna L\u00f6fgren", "Katrin Mani", "Ulf Ellervik"], "doi": "10.1016/j.bmc.2013.02.062", "journal": "Bioorganic & Medicinal Chemistry"}
+{"title": "Pulse shaping of incoherent light by use of a liquid-crystal modulator array", "sha": "beb8750fbdcba603a9362745c62fdd1ede63051f", "authors": ["V Binjrajka", "C.-C Chang", "A Emanuel", "D Leaird", "A Weiner"], "doi": null, "journal": null}
+{"title": "THE FINE GUIDANCE SENSOR ORBIT OF THE G4 BRIGHT GIANT HD 173764", "sha": "bf514fa6dc19106b7bced910e845ad1e675f37fd", "authors": ["Sidney Parsons", "Otto Franz", "Lawrence Wasserman"], "doi": null, "journal": null}
+{"title": "KATP-channels in beta-cells in tissue slices are directly modulated by millimolar ATP", "sha": "c0b7ce2ec5f57040ea8dddc15d669411f3ee339b", "authors": ["S Speier", "S-B Yang", "K Sroka", "T Rose", "M Rupnik"], "doi": "10.1016/j.mce.2004.11.002", "journal": "Molecular and Cellular Endocrinology"}
+{"title": "Evaluating Electrocatalysts for the Hydrogen Evolution Reaction Using Bipolar Electrode Arrays: Bi- and Trimetallic Combinations of Co, Fe, Ni, Mo, and W", "sha": "c0c1f72eecf03b335ff3a8c5381f36e0d58d7cf1", "authors": ["Stephen Fosdick", "Sean Berglund", "C Mullins", "Richard Crooks"], "doi": "10.1021/cs500168t", "journal": "ACS Catalysis"}
+{"title": "Including non-trade concerns: the environment in EU and US agricultural policy Including non-trade concerns 263", "sha": "c0dd1d6d5d11faffe243bdb6a729b6ea7629ba59", "authors": ["Kathy Baylis", "Gordon Rausser", "Leo Simon"], "doi": null, "journal": "Int. J. Agricultural Resources Governance and Ecology"}
+{"title": "Tailoring the enhanced frequency shift in two-dimensional photonic clusters", "sha": "c0f596c40a060d81957674f028cd931a262c7df3", "authors": ["A Asatryan", "L Botten", "N Nicorovici", "R Mcphedran", "C De Sterke"], "doi": null, "journal": null}
+{"title": "Capacity Bounds for the Gaussian Interference Channel", "sha": "c120df9b9f8baf66d60ee6ea7665ff0265dac2a1", "authors": ["Abolfazl Motahari", "Amir Khandani"], "doi": null, "journal": null}
+{"title": "Regime-Switching and the Estimation of Multifractal Processes *", "sha": "c150bd00f0ef648f937ced7fb214f2b7e347006a", "authors": ["Laurent Calvet", "Adlai Fisher", "F Bollerslev", "R Diebold", "J Engle", "J Hamilton", "N Mackinnon", "D Shephard", "R Smith", "Tsay"], "doi": null, "journal": null}
+{"title": "Fasting hyperglycemia upon hospital admission is associated with higher pneumonia complication rates among the elderly", "sha": "c204f24cd4e7d23778804b05cba2649a6bc253f3", "authors": ["Mario Castellanos", "Anita Szerszen", "Chadi Saifan", "Irina Zigelboym", "Georges Khoueiry", "Nidal Rafeh", "Robert Wetz", "Morton Kleiner", "Nelly Aoun", "Kera Weiserbs", "Theodore Maniatis", "Jeffrey Rothman"], "doi": null, "journal": null}
+{"title": "Effect of androgen deprivation therapy on the expression of prostate cancer biomarkers MSMB and MSMB-binding protein CRISP3", "sha": "c21561a1a7c0dc7f1062c9742c6dc5f0fbbf1785", "authors": ["A Dahlman", "A Edsj\u00f6", "C Halld\u00e9n", "J Persson", "S Fine", "H Lilja", "W Gerald", "A Bjartell"], "doi": "10.1038/pcan.2010.25", "journal": "Prostate Cancer and Prostatic Diseases"}
+{"title": "Testing Monotonicity of Pricing Kernels", "sha": "c21fdc00e8ef984da700c9c8b41be80d5214271f", "authors": ["Yuri Golubev", "Wolfgang H\u00e4rdle", "Roman Timonfeev"], "doi": null, "journal": null}
+{"title": "Steplike magnetization of spin chains in a triangular lattice: Monte Carlo simulations", "sha": "c2944ff3d3756605737f2ab8ea7721f01f23765e", "authors": ["X Yao", "S Dong", "J-M Liu"], "doi": "10.1103/physrevb.73.212415", "journal": "Physical Review B"}
+{"title": "Ultrasonic Measurement of Depth-Dependent Transient Behaviors of Articular Cartilage under Compression", "sha": "c2cda2b48d055ddf227e31d3b4aae758e5da9791", "authors": ["Y Zheng", "H Niu", "F Mak", "Y Huang", "Yongping Zheng"], "doi": null, "journal": "Journal of Biomechanics Submitting as a Communication First Submission"}
+{"title": "Bone Marrow Transplantation Confers Modest Benefits in Mouse Models of Huntington's Disease", "sha": "c2ffbb4ec03c60a144d06736917fb9e251695ab5", "authors": ["W Kwan", "A Magnusson", "A Chou", "A Adame", "M Carson", "S Kohsaka", "E Masliah", "T Moller", "R Ransohoff", "S Tabrizi", "M Bjorkqvist", "P Muchowski"], "doi": "10.1523/jneurosci.4846-11.2012", "journal": "Journal of Neuroscience"}
+{"title": "Analysis of Recursive State Machines Analysis of Recursive State Machines", "sha": "c33a7f02d85e6e17c8509324f8e95483f9db62bb", "authors": ["Rajeev Alur", "; Benedikt", "M Yannakakis"], "doi": null, "journal": "ACM Transactions on Programming Languages and Systems"}
+{"title": "BOUNDARY LAYER THEORY DESCRIPTION OF SOLUTE TRANSPORT IN SOIL", "sha": "c36df3125f1e254cb5627000d63c516bb9a5ad42", "authors": ["Quanjiu Wang", "Robert Horton"], "doi": "10.1097/ss.0b013e31814cee60", "journal": "Soil Science"}
+{"title": "Medicinal Plant Conservation", "sha": "c37efe7039bf47e07a6e22f0045a7752572c2e19", "authors": [], "doi": null, "journal": null}
+{"title": "Involvement of Leptin in the Progression of Experimentally Induced Peritoneal Fibrosis in Mice", "sha": "c3bac0531492c19b1d861ead939f3d9807a8f0a4", "authors": ["Masayuki Nakazawa", "Yoko Obata", "Tomoya Nishino", "Shinichi Abe", "Yuka Nakazawa", "Katsushige Abe", "Akira Furusu", "Masanobu Miyazaki", "Takehiko Koji", "Shigeru Kohno"], "doi": "10.1267/ahc.13005", "journal": "ACTA HISTOCHEMICA ET CYTOCHEMICA"}
+{"title": "Available online at w.sciencedirect.com Gas-Particle Flow and Combustion Characteristics of Pulverized Coal Injection in Blast Furnace Raceway", "sha": "c3f162ad89e1ae3af93e5703e6ddcc9cc76d9f82", "authors": ["&quot; Sciencedirect", "Sheng-Fu Zhang", "Chen-Guang", "Wen Liang-Ying", "Gui-Bao Lu", "Xue-Wei"], "doi": null, "journal": null}
+{"title": "Receptor interacting protein kinase 2\u2013mediated mitophagy regulates inflammasome activation during virus infection", "sha": "c40c38afc8988b23460a3d57eadbc0690d276c5e", "authors": ["Christopher Lupfer", "Paul Thomas", "Paras Anand", "Peter Vogel", "Sandra Milasta", "Jennifer Martinez", "Gonghua Huang", "Maggie Green", "Mondira Kundu", "Hongbo Chi", "Ramnik Xavier", "Douglas Green", "Mohamed Lamkanfi", "Charles Dinarello", "Peter Doherty", "Thirumala-Devi Kanneganti"], "doi": "10.1038/ni.2563", "journal": "Nature Immunology"}
+{"title": "The semantic priming project", "sha": "c44e3aa9b8e318e61c47e706f5545334ef399545", "authors": ["Keith Hutchison", "David Balota", "James Neely", "Michael Cortese", "Emily Cohen-Shikora", "Chi-Shing Tse", "Melvin Yap", "Jesse Bengson", "Dale Niemeyer", "Erin Buchanan"], "doi": "10.3758/s13428-012-0304-z", "journal": "Behavior Research Methods"}
+{"title": "ON GALOIS PROJECTIVE GROUP RINGS Let A be a ring with I, C the center of A and G' an inner automorphism group of A induced by [U in A / O in a finite group G whose order is invertible. Let A G' be the fixed subring of", "sha": "c47ed77d410cae2cb7eee986088679b40edcec0a", "authors": ["George Szeto", "U", "Linjun Ma"], "doi": null, "journal": "Internat. J. Math. & Math. Sci. VOI"}
+{"title": "An introduction to the supplement \u2018A practical approach to the nutritional management of children with cerebral palsy\u2019", "sha": "c4a1f27ca9a4973a1734b38c487b446b311bbc22", "authors": ["F Gottrand", "P Sullivan"], "doi": "10.1038/ejcn.2013.221", "journal": "European Journal of Clinical Nutrition"}
+{"title": "Mobile Ad Hoc Networking Approach to Detecting and Querying Events Related to Farm Animals", "sha": "c5b6edd4b31e2bcfdc625557d2a33212698084c2", "authors": ["Milena Radenkovic", "Bartosz Wietrzyk"], "doi": null, "journal": null}
+{"title": "Evolutionary divergence and functions of the ADAM and ADAMTS gene families", "sha": "c6052041b29ab547dc3a2d05f1a1202769b5ce9b", "authors": ["Chad Brocker", "Vasilis Vasiliou", "Daniel Nebert"], "doi": null, "journal": null}
+{"title": "Efficient and improved synthesis of Telmisartan", "sha": "c61bc681c7932518eb0f66e159a5af6f995b1fd2", "authors": ["A Sanjeev Kumar", "Samir Ghosh", "G Mehta"], "doi": "10.3762/bjoc.6.25", "journal": "Beilstein Journal of Organic Chemistry"}
+{"title": "The effects of repeated thermal therapy for two patients with chronic fatigue syndrome", "sha": "c6d421bffce8136cc9fa18bed1519115a4284a25", "authors": ["Akinori Masuda", "Takashi Kihara", "Tsuyoshi Fukudome", "Takuro Shinsato", "Shinichi Minagoe", "Chuwa Tei"], "doi": "10.1016/j.jpsychores.2004.11.005", "journal": "Journal of Psychosomatic Research"}
+{"title": "Increased androgen receptor expression in serous carcinoma of the ovary is associated with an improved survival", "sha": "c72d0905fd87b615950533056c0600b66e84919d", "authors": ["Bj\u00f6rn Nodin", "Nooreldin Zendehrokh", "Jenny Br\u00e4ndstedt", "Elise Nilsson", "Jonas Manjer", "Donal Brennan", "Karin Jirstr\u00f6m"], "doi": "10.1186/1757-2215-3-14", "journal": "Journal of Ovarian Research"}
+{"title": "Experimental observation using particle image velocimetry of inertial waves in a rotating fluid", "sha": "c75901f73105b3f724350f9f555626242f893203", "authors": ["Laura Messio", "A Cyprien", "Morize Ae", "Marc Ae", "Fr\u00e9d\u00e9ric Moisy"], "doi": null, "journal": null}
+{"title": "Securing Human Rights Intellectually: Philosophical Inquiries about the Universal Declaration", "sha": "c75b4bfa53a31ded383019c5fa173de00718206f", "authors": [], "doi": null, "journal": null}
+{"title": "Aprendizagem ancorada em emo\u00e7\u00f5es (LRL) como estrat\u00e9gia para processos de mudan\u00e7a organizacional 1", "sha": "c787286f60b5afcf9b34c9b0d5132e08cc8f64a5", "authors": ["Ramiro Zinder Da Silva", "Narbal Silva"], "doi": null, "journal": null}
+{"title": "Morphological Characterization of Self-Assembled Peptide Nucleic Acid Amphiphiles", "sha": "c7a942237cf69a0fe6e1b8b512a3760f96ca3040", "authors": ["Cheryl Lau", "Ronit Bitton", "Havazelet Bianco-Peled", "David Schultz", "David Cookson", "Shane Grosser", "James Schneider"], "doi": "10.1021/jp057049h", "journal": "The Journal of Physical Chemistry B"}
+{"title": "A phylogenetic overview of the Agaricomycotina", "sha": "c7d5f924eb6fd54fd79d7e97eee0d4fa5a3450bb", "authors": ["David Hibbett"], "doi": null, "journal": null}
+{"title": "Sa\u00fade bucal no Programa Sa\u00fade da Fam\u00edlia: uma avalia\u00e7\u00e3o do modelo assistencial Oral health in the Brazilian Family Health Program: a health care model evaluation", "sha": "c7f583a38215a23db8d72f15bbc7c82659ebb89a", "authors": ["Cad", "Rio Sa\u00fade P\u00fablica", "De Janeiro"], "doi": null, "journal": null}
+{"title": "A Janus-Like Role of CREB Protein: Enhancement of Synaptic Property in Mature Neurons and Suppression of Synaptogenesis and Reduced Network Synchrony in Early Development", "sha": "c80e94a911e2f77cdce8a9f26adfe9e98e70e837", "authors": ["Mio Nonaka"], "doi": "10.1523/jneurosci.1309-09.2009", "journal": "Journal of Neuroscience"}
+{"title": "How useful are unpublished data from the Food and Drug Administration in meta-analysis?", "sha": "c816804edb9285b6671b652b132086e5976c1191", "authors": ["Catherine Maclean", "Sally Morton", "Joshua Ofman", "Elizabeth Roth", "Paul Shekelle"], "doi": null, "journal": "Journal of Clinical Epidemiology"}
+{"title": "Separating distributed source coding from network coding", "sha": "c84dff7d1b336173b2c320ee7f592c59c9a3413a", "authors": ["A Ramamoorthy", "K Jain", "P Chou", "M Effros"], "doi": "10.1109/tit.2006.874534", "journal": "IEEE Transactions on Information Theory"}
+{"title": "A Discrete Labelling Approach to Attributed Graph Matching Using SIFT Features", "sha": "c8d84fc463d76653494382273577e0fc3b856591", "authors": ["Gerard Sanroma", "Rene Alquezar", "Francesc Serratosa"], "doi": "10.1109/icpr.2010.239", "journal": "2010 20th International Conference on Pattern Recognition"}
+{"title": "Comparison of thallium-201 SPECT redistribution patterns and rubidium-82 PET rest-stress myocardial blood flow imaging *", "sha": "c9236bf7c663698bdc4484acfbe1e623cc2a1fdd", "authors": ["Richard Stewart", "Jeffrey Popma", "Gerald Gacioch", "Morton Kalus", "Sheila Squicciarini", "Ziad A1-Aouar", "M Schork", "Markus Schwaiger"], "doi": null, "journal": "International Journal of Cardiac Imaging"}
+{"title": "American Society of Echocardiography/Society of Cardiovascular Anesthesiologists Recommendations and Guidelines for Continuous Quality Improvement in Perioperative Echocardiography", "sha": "c94d844e8a2d151a9c52307807cf3781f9468e55", "authors": ["Joseph Mathew", "Kathryn Glas", "Christopher Troianos", "Pamela Sears-Rogan", "Robert Savage", "Jack Shanewise", "Joseph Kisslo", "Solomon Aronson", "Stanton Shernan"], "doi": "10.1016/j.echo.2006.08.039", "journal": "Journal of the American Society of Echocardiography"}
+{"title": "Quantum transport of slow charge carriers in quasicrystals and correlated systems", "sha": "c9987d533fdb6921848fc2c25b69fdfa8b12fc2b", "authors": ["Guy Trambly De Laissardi\u00e8re", "Jean-Pierre Julien", "Didier Mayou", "Guy Trambly De Laissardi\u00e8re", "Jean-Pierre Julien", "Mayou Didier"], "doi": "10.1103/PhysRevLett.97.026601>", "journal": null}
+{"title": "Hipercapnia Acentuada Durante Circula\u00e7\u00e3o Extracorp\u00f3rea em Cirurgia para Revasculariza\u00e7\u00e3o do Mioc\u00e1rdio. Relato de Caso * Marked Hypercapnia During Cardiopulmonary Bypass for Myocardial Revascularization. Case Report", "sha": "c99e64426737d1c895436c5c456a74dcbc678cca", "authors": ["Maur\u00edcio Serrano", "Nascimento Tsa", "Cassiano Franco Bernardes", "Tsa", "Roberta Louro De Medeiros"], "doi": null, "journal": null}
+{"title": "Differential Distribution of Stem Cells in the Auditory and Vestibular Organs of the Inner Ear", "sha": "c9b376a4928ae84dad0cf2eacb2bc3d63c14b8d0", "authors": ["Kazuo Oshima", "Christian Grimm", "C Corrales", "Pascal Senn", "Rodrigo Martinez Monedero", "Gwena\u00eblle G\u00e9l\u00e9oc", "Albert Edge", "Jeffrey Holt", "Stefan Heller"], "doi": "10.1007/s10162-006-0058-3", "journal": "Journal of the Association for Research in Otolaryngology"}
+{"title": "How We Teach Experience with a theme-based integrated renal module for a second-year MBBS class Downloaded from", "sha": "ca480f120dafad3e18c8d71120708e8b1b44a090", "authors": ["Riffat Shafi", "K Quadri", "Waseem Ahmed", "Syed Mahmud", "Mobeen Iqbal"], "doi": null, "journal": "Adv Physiol Educ"}
+{"title": "Protection performance components in MPLS networks", "sha": "ca904796998dca30a520b2639015eb01f6395852", "authors": ["Eusebi Calle", "Jos\u00e9 Marzo", "Anna Urra"], "doi": "10.1016/j.comcom.2004.02.025", "journal": "Computer Communications"}
+{"title": "Discriminating short-range from van der Waals forces using total force data in noncontact atomic force microscopy", "sha": "ca9431ff09746a7e046a955d446da9b651378c9a", "authors": ["Stefan Kuhn", "Philipp Rahe"], "doi": "10.1103/physrevb.89.235417", "journal": "Physical Review B"}
+{"title": "A GENERALIZATION OF ABSOLUTE RETRACTS", "sha": "cad8307fcb942203e9a929a352d8a2057dcb9743", "authors": ["John Martini"], "doi": null, "journal": "PROCEEDINGS OF THE AMERICAN MATHEMATICAL SOCIETY"}
+{"title": "Low Complexity Optimal Joint Detection for Oversaturated Multiple Access Communications", "sha": "caf7637cb58ce5ecf67e67c1572441bdbfa9bf92", "authors": ["Rachel Learned", "Alan Willsky", "Don Boroson"], "doi": null, "journal": "IEEE TRANSACTIONS ON SIGNAL PROCESSING"}
+{"title": "Ya.B. Zeldovich and equation of state problems for matter under extreme conditions", "sha": "cb369159788956668cb3240703217539eacaacb9", "authors": ["Vladimir Fortov", "Igor Lomonosov"], "doi": "10.3367/ufnr.0184.201403b.0231", "journal": "Uspekhi Fizicheskih Nauk"}
+{"title": "Avalia\u00e7\u00e3o da Aplicabilidade de T\u00e9cnicas MIC/FT-IR/DSC para a Caracteriza\u00e7\u00e3o de Filmes Multicamadas", "sha": "cb4ab458c9af3538893c6ac6e067d69c9a451438", "authors": ["Luciano Nogueira", "Rita Dutra", "Milton Diniz", "Marcia Pires", "M\u00f4nica Evangelista", "Fernanda Santana", "Leandro Tomasi", "Priscila Dos Santos", "Regina Nonemacher"], "doi": null, "journal": "Pol\u00edmeros: Ci\u00eancia e Tecnologia"}
+{"title": "Thermal properties, miscibility and specific interactions in comparison of linear and star poly(methyl methacrylate) blend with phenolic", "sha": "cbd9a1cb400c552f7dfd239658c058f1b6afedae", "authors": ["Chih-Feng Huang", "Shiao-Wei Kuo", "Han-Ching Lin", "Jem-Kun Chen", "Yu-Kai Chen", "Hongyao Xu", "Feng-Chih Chang"], "doi": "10.1016/j.polymer.2004.05.043", "journal": "Polymer"}
+{"title": null, "sha": "cc2df68f5e628916d0b49661943309505a07717c", "authors": [], "doi": null, "journal": null}
+{"title": "When chromatin meets splicing", "sha": "cc38e0991211fe76b55c9002fb3263182d004ce8", "authors": ["Alberto Kornblihtt", "Ignacio Schor", "Mariano Allo", "Alberto Kornblihtt", "Ignacio Schor", "Mariano Allo", "Benjamin Blencowe"], "doi": null, "journal": null}
+{"title": "Neural correlates of feeling sympathy", "sha": "cccaae1e95ac9611194a6bb12f8f63bea0dce4d2", "authors": ["Jean Decety", "Thierry Chaminade"], "doi": null, "journal": "Neuropsychologia"}
+{"title": "European External Quality Control Study on the Competence of Laboratories to Recognize Rare Sequence Variants Resulting in Unusual Genotyping Results", "sha": "ccedb90e95bc0b5b4de91236f6c064f82af726b7", "authors": ["J Marki-Zay", "C Klein", "D Gancberg", "H Schimmel", "L Dux"], "doi": "10.1373/clinchem.2008.112102", "journal": "Clinical Chemistry"}
+{"title": null, "sha": "cd3752fc84f00134c7a613a2a1312fa504842753", "authors": [], "doi": "10.1016/j.jet.2011.08.001)", "journal": null}
+{"title": "CONCENTRATION PROFILE IN THE GRADIENT ZONE OF SMALL SOLAR PONDS", "sha": "cd4103ed2ca6b97d13595264b517dbb8be09c23f", "authors": ["J Sr1nivasan", "Abhijit Guha"], "doi": null, "journal": null}
+{"title": "Multi-criteria analysis of alternative-fuel buses for public transportation", "sha": "cd43b2c2a2a33e508f01df4fe9480c1a244cebde", "authors": ["Gwo-Hshiung Tzeng", "Cheng-Wei Lin", "Serafim Opricovic"], "doi": "10.1016/j.enpol.2003.12.014", "journal": "Energy Policy"}
+{"title": "A conservation plan for a global biodiversity hotspot\u2014the Cape Floristic Region, South Africa", "sha": "cd4e3f97d7a0483ee45181ef32021d67b94ec273", "authors": ["R Cowling", "R Pressey", "M Rouget", "A Lombard"], "doi": "10.1016/s0006-3207(02)00425-1", "journal": "Biological Conservation"}
+{"title": "THE ANGULAR MOMENTUM OF MAGNETIZED MOLECULAR CLOUD CORES: A TWO-DIMENSIONAL-THREE-DIMENSIONAL COMPARISON", "sha": "cd7c3720973d76373e2d8f960ab9fc37408506de", "authors": ["Sami Dib", "Patrick Hennebelle", "Jaime Pineda", "Timea Csengeri", "Sylvain Bontemps", "Edouard Audit", "Alyssa Goodman"], "doi": "10.1088/0004-637x/723/1/425", "journal": "The Astrophysical Journal"}
+{"title": "Location-based grid-index for spatial query processing", "sha": "cda765ca96c26e755457955f64795e3f4c3faf07", "authors": ["Kwangjin Park"], "doi": "10.1016/j.eswa.2013.08.027", "journal": "Expert Systems with Applications"}
+{"title": "State-dependent symplecticity and area preserving numerical methods", "sha": "cddebc14a4480c21aca2ee2057b85b6cd97d1cc1", "authors": ["Felice Iavernaro", "Donato Trigiante"], "doi": "10.1016/j.cam.2006.02.058", "journal": "Journal of Computational and Applied Mathematics"}
+{"title": "Pulmonary function and fuel use: A population survey", "sha": "ce3f33f4a8183294ea1b4bc213961b1bdd0ca7b0", "authors": ["Asim Saha", "N Mohan Rao", "P Kulkarni", "P Majumdar", "H Saiyed"], "doi": "10.1186/1465-9921-6-127", "journal": "Respiratory Research"}
+{"title": "A Reference Architecture for Scientific Workflow Management Systems and the VIEW SOA Solution", "sha": "ce60c5ee80f26ea0d18a292d9ebaf7b4bc27b152", "authors": ["Cui Lin", "Shiyong Lu", "Xubo Fei", "A Chebotko", "Darshan Pai", "Zhaoqiang Lai", "F Fotouhi", "Jing Hua"], "doi": "10.1109/tsc.2009.4", "journal": "IEEE Transactions on Services Computing"}
+{"title": "ACCESSING VIDEO ARCHIVES USING INTERACTIVE SEARCH", "sha": "ce71a60c44a5343cd80c3695a6ac5f4d72210cef", "authors": ["M Worring", "G Nguyen", "L Hollink", "J Van Gemert", "D Koelma"], "doi": null, "journal": null}
+{"title": "Activity of azithromycin or erythromycin in combination with antimalarial drugs against multidrug-resistant Plasmodium falciparum in vitro", "sha": "ce7e77d79cecc829afa035d742b43915914ce462", "authors": ["S Nakornchai", "P Konthiang"], "doi": "10.1016/j.actatropica.2006.10.008", "journal": "Acta Tropica"}
+{"title": "Orthodontic movement of impacted cuspid in fibrodysplastic bone: A case report", "sha": "cee2ea38de1f469a6667e25947a2eaedcf5492e3", "authors": ["Giuseppe Colella", "Angelo Itro", "Letizia Perillo", "Rosangela Cannavale"], "doi": "10.1016/j.bone.2009.08.049", "journal": "Bone"}
+{"title": "Preciseness of Subtyping on Intersection and Union Types", "sha": "cf1d2c008f6ed3722301d2fe808b22d2aaa4927e", "authors": ["Mariangiola Dezani-Ciancaglini", "Silvia Ghilezan"], "doi": null, "journal": null}
+{"title": "Learning to Write Together Using Groupware", "sha": "cf6c3f2d7722735c6f401f54d26932c9dc6bd8e2", "authors": ["Ilona Posner", "Alex Mitchell", "Ronald Baecker"], "doi": null, "journal": "Appears in Computer Supported Cooperative Writing"}
+{"title": "Designing on-site: Facilitating participatory contextual architecture with mobile phones", "sha": "cf8640291b4fa77011b9fda161f7c1e8ed07b3af", "authors": ["Mikael Skov", "Jesper Kjeldskov", "Jeni Paay", "Niels Husted", "Jacob N\u00f8rskov", "Kenneth Pedersen"], "doi": "10.1016/j.pmcj.2012.05.004", "journal": "Pervasive and Mobile Computing"}
+{"title": "High-throughput and combinatorial gene expression on a chip for metabolism-induced toxicology screening", "sha": "cfa9b6ec9635733ce88b10f55bc78fef3f27da4c", "authors": ["Seok Kwon", "Dong Lee", "Dhiral Shah", "Bosung Ku", "Sang Jeon", "Kusum Solanki", "Jessica Ryan", "Douglas Clark", "Jonathan Dordick", "Moo-Yeal Lee"], "doi": "10.1038/ncomms4739", "journal": "Nature Communications"}
+{"title": "Dense Subgraph Partition of Positive Hypergraphs", "sha": "cfe5f4fe4bd428aaed47cba12ef8fbd8617692b9", "authors": ["Hairong Liu", "Longin Latecki", "Shuicheng Yan"], "doi": "10.1109/tpami.2014.2346173", "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence"}
+{"title": "Incidence and Clinical Signii cance of Inducible Atrial Tachycardia in Patients with Atrioventricular Nodal Reentrant Tachycardia", "sha": "d00212db0e4ae4a5709e2b74f6342b171fcb9c33", "authors": ["Christian Sticherling", "Hiroshi Tada", "Radmira Greenstein", "Chi-Wo Chan", "Steven Chough", "Robert Baker", "Kristina Wasmer", "Frank Pelosi", "Bradley Knight", "S Strickberger", "Fred Morady"], "doi": null, "journal": "J Cardiovasc Electrophysiol"}
+{"title": "The preprophase band is a localized center of clathrin-mediated endocytosis in late prophase cells of the onion cotyledon epidermis", "sha": "d038ae96b106a7d7d34e636df4a75b2df88f1095", "authors": ["Ichirou Karahara", "Jinsuke Suda", "Hiroshi Tahara", "Etsuo Yokota", "Teruo Shimmen", "Kazuyo Misaki", "Shigenobu Yonemura", "Lucas Staehelin", "Yoshinobu Mineyuki"], "doi": "10.1111/j.1365-313x.2008.03725.x", "journal": "The Plant Journal"}
+{"title": "INTERNATIONAL CENTRE FOR THEORETICAL PHYSICS NULL GEODESICS IN BLACK HOLE METRICS WITH NON-ZERO COSMOLOGICAL CONSTANT INTERNATIONAL ATOMIC ENERGY AGENCY UNITED NATIONS EDUCATIONAL, SCIENTIFIC AND CULTURAL ORGANIZATION", "sha": "d070cee4af7f04b3e7b23b0ee0369c1d93deffda", "authors": ["Zdenek Stuchlik", "Massimo Calvani"], "doi": null, "journal": null}
+{"title": "Coupled Ocean-Atmosphere Inter-Decadal Modes in the Tropics", "sha": "d08c93f8093c2e94765114b44c7f23da981ca1a1", "authors": ["B Goswami", "Mann Thomas"], "doi": null, "journal": "Journal of the Meteorological Society of Japan"}
+{"title": "Genetic operators for combinatorial optimization in TSP and microarray gene ordering", "sha": "d0a6d13460ebdd8259590f1c014ff51353802af3", "authors": ["Shubhra Ray", "Sanghamitra Bandyopadhyay", "Sankar Pal"], "doi": "10.1007/s10489-006-0018-y", "journal": "Applied Intelligence"}
+{"title": "GIS-Based Multicriteria Evaluation Approach in Planning Tourism Development Sites in Environmentally Sensitive Areas", "sha": "d1a88cf2ee5f613865ea355364bd0de5f991e241", "authors": ["Norhidayah Harun", "Narimah Samat"], "doi": "10.1051/shsconf/20162302001", "journal": "SHS Web of Conferences"}
+{"title": "Entanglement generation by adiabatic navigation in the space of symmetric multiparticle states", "sha": "d2037a955d3e90bfea3441ecb460d76aaf1606be", "authors": ["Razmik Unanyan", "Michael Fleischhauer", "Nikolay Vitanov", "Klaas Bergmann"], "doi": "10.1103/physreva.66.042101", "journal": "Physical Review A"}
+{"title": null, "sha": "d22015b357a39b49942c45ca817b281bd0380f01", "authors": [], "doi": null, "journal": null}
+{"title": "U^Th dating of marine isotope stage 7 in Bahamas slope sediments", "sha": "d2a32e5086fc1e50885b30a51f1c128af3287a72", "authors": ["Laura Robinson", "Gideon Henderson", "Niall Slowey"], "doi": null, "journal": null}
+{"title": "Combining the Species-Area-Habitat Relationship and Environmental Cluster Analysis to Set Conservation Priorities: a Study in the Zhoushan Archipelago, China", "sha": "d2d06507bd3db01b3c42e16cddc203b86518a138", "authors": ["You-Hua Chen"], "doi": "10.1111/j.1523-1739.2008.01084.x", "journal": "Conservation Biology"}
+{"title": "Sustained Quantum Coherence and Entanglement in the Avian Compass", "sha": "d2e897e28d196c17d735ed9f539f2193d6fd29b3", "authors": ["Erik Gauger", "Elisabeth Rieper", "John Morton", "Simon Benjamin", "Vlatko Vedral"], "doi": "10.1103/physrevlett.106.040503", "journal": "Physical Review Letters"}
+{"title": "Application of Plasma Immersion Ion Implantation Doping to Low-Temperature Processed Poly-Si TFT's", "sha": "d33bf5ea4bceeca74a24118cf6fe360a3f4212aa", "authors": ["Ching-Fa Yeh", "Tai-Ju Chen", "Chung Liu", "Jiqun Shao", "Nathan Cheung"], "doi": null, "journal": "IEEE ELECTRON DEVICE LETTERS"}
+{"title": "In Search for a Better Marker of Acute Pancreatitis--Third Time Lucky?", "sha": "d37edd4b819188ce01e6dfc92fa9442e02658660", "authors": ["A Viljoen", "J Patrick"], "doi": "10.1373/clinchem.2011.173385", "journal": "Clinical Chemistry"}
+{"title": "A Priori Error Estimates for the Finite Element Discretization of Elliptic Parameter Identification Problems with Pointwise Measurements", "sha": "d3b3aa88fcba0925645d8fc9ff998b038ce52e5c", "authors": ["R Rannacher", "B Vexler"], "doi": "10.1137/040611100", "journal": "SIAM Journal on Control and Optimization"}
+{"title": "Measurement of the Mass Difference between t and t Quarks", "sha": "d4e24593396ffc0f58078b849c649edcb6924c80", "authors": ["T Aaltonen", "B A \u00b4 Lvarez Gonz\u00e1lez", "S Amerio", "D Amidei", "A Anastassov", "A Annovi", "J Antos", "G Apollinari", "J Appel", "A Apresyan", "T Arisawa", "A Artikov", "J Asaadi", "W Ashmanskas", "B Auerbach", "A Aurisano", "F Azfar", "W Badgett", "A Barbaro-Galtieri", "V Barnes", "B Barnett", "P Barria", "P Bartos", "M Bauce", "G Bauer", "F Bedeschi", "D Beecher", "S Behari", "G Bellettini", "J Bellinger", "D Benjamin", "A Beretvas", "A Bhatti", "M Binkley", "D Bisello", "I Bizjak", "K Bland", "B Blumenfeld", "A Bocci", "A Bodek", "D Bortoletto", "J Boudreau", "A Boveia", "B Brau", "L Brigliadori", "A Brisuda", "C Bromberg", "E Brucken", "M Bucciantonio", "J Budagov", "H Budd", "S Budd", "K Burkett", "G Busetto", "P Bussey", "A Buzatu", "S Hahn", "E Halkiadakis", "A Hamaguchi", "J Han", "F Happacher", "K Hara", "D Hare", "M Hare", "R Harr", "K Hatakeyama", "C Hays", "M Heck", "J Heinrich", "M Herndon", "S Hewamanage", "D Hidas", "A Hocker", "W Hopkins", "D Horn", "S Hou", "R Hughes", "M Hurwitz", "U Husemann", "N Hussain", "M Hussein", "J Huston", "G Introzzi", "M Iori", "A Ivanov", "E James", "D Jang", "B Jayatilaka", "E Jeon", "M Jha", "S Jindariani", "W Johnson", "M Jones", "K Joo", "S Jun", "T Junk", "T Kamon", "P Karchin", "Y Kato", "W Ketchum", "J Keung", "V Khotilovich", "B Kilminster", "D Kim", "H Kim", "H Kim", "J Kim", "M Kim", "S Kim", "S Kim", "Y Kim", "N Kimura", "M Kirby", "S Klimenko", "K Kondo", "D Kong", "J Konigsberg", "A Kotwal", "M Kreps", "J Kroll", "D Krop"], "doi": null, "journal": null}
+{"title": "The isolation of microsatellite loci in the Mediterranean fruitfly Ceratitis capitata (Diptera: Tephritidae) using a biotin / streptavidin enrichment technique", "sha": "d54783dcc722251bdc57a8e774889bb464b2fbc5", "authors": ["Blackwell Science", "Ltd", "D", "S", "A B U R N E L L"], "doi": null, "journal": "Molecular Ecology Notes"}
+{"title": "The development of the EULAR-OMERACT rheumatoid arthritis MRI reference image atlas", "sha": "d69351a9073651ece8c9ace1ba3b91cee9b5f2e3", "authors": ["P Bird"], "doi": "10.1136/ard.2004.031807", "journal": "Annals of the Rheumatic Diseases"}
+{"title": "Essential Oils and Latices as Novel Antiviral Agent Against Potato Leaf Roll Virus and Analysis of Their Phytochemical Constituents Responsible for Antiviral Activity", "sha": "d6b0e6d8ffdc2e792eec27198989dc1261016887", "authors": ["Sehrish Iftikhar", "Ahmad Shahid", "Shabnam Javed", "Idrees Nasir", "Bushra Tabassum", "M Haider"], "doi": "10.5539/jas.v5n7p167", "journal": "Journal of Agricultural Science"}
+{"title": "Melem (2,5,8-Triamino-tri-s-triazine), an Important Intermediate during Condensation of Melamine Rings to Graphitic Carbon Nitride: Synthesis, Structure Determination by X-ray Powder Diffractometry, Solid-State NMR, and Theoretical Studies", "sha": "d785e8b5d186d918411cf42294d848760c3ace24", "authors": ["Barbara J\u00fcrgens", "Elisabeth Irran", "J\u00fcrgen Senker", "Peter Kroll", "Helen M\u00fcller", "Wolfgang Schnick"], "doi": "10.1021/ja0357689", "journal": "Journal of the American Chemical Society"}
+{"title": "Fault-Tolerant Robot Programming through Simulation with Realistic Sensor Models", "sha": "d7862185db4df3b54ed91e897ce498e25235dd8c", "authors": ["Thomas Br\u00e4unl", "Andreas Koestler", "Axel Waggershauser"], "doi": null, "journal": "International Journal of Advanced Robotic Systems"}
+{"title": "The Minimum Size of Qubit Unextendible Product Bases", "sha": "d7a8d81ae1d43503502cdb7d1c561f2cadeee03d", "authors": ["Nathaniel Johnston"], "doi": "10.4230/LIPIcs.TQC.2013.93", "journal": "licensed under Creative Commons License CC-BY 8th Conference on Theory of Quantum Computation, Communication and Cryptography"}
+{"title": "Extending UML/MARTE to Support Discrete Controller Synthesis, Application to Reconfigurable Systems-on-Chip Modeling", "sha": "d7e433750b0bb782eae277bca97db3ab74c551ea", "authors": ["S\u00e9bastien Guillet", "Florent Lamotte", "Nicolas Griguer", "\u00c9ric Rutten", "Guy Gogniat", "Jean-Philippe Diguet"], "doi": "10.1145/2629628", "journal": "ACM Transactions on Reconfigurable Technology and Systems"}
+{"title": "Finding optimal paths in MREP routing", "sha": "d7e97c0b43a7ec0eba143be18cfbb8a525c7afc1", "authors": ["Rudolf Fleischer", "Mordecai Golin", "Chin-Tau Lea", "Steven Wong"], "doi": "10.1016/j.ipl.2003.10.005", "journal": "Information Processing Letters"}
+{"title": "Different dietary restriction regimens extend lifespan by both independent and overlapping genetic pathways inC. elegans", "sha": "d8dc02b95fcb8e12d38a444d06aa42cd9204ec93", "authors": ["Eric Greer", "Anne Brunet"], "doi": "10.1111/j.1474-9726.2009.00459.x", "journal": "Aging Cell"}
+{"title": "Objects and Classes CS211 Fall 2000 2", "sha": "d9283460d1b1f10cfb5f9c3a4ffbf4e8d3a70f92", "authors": [], "doi": null, "journal": null}
+{"title": "Biomimetic Fiber-Reinforced Compound Materials", "sha": "d95e221c6ced102ab185ac0099dfd1cc20d1d486", "authors": ["Tom Masselter", "Thomas Speck"], "doi": null, "journal": null}
+{"title": "methodological-experimental Continuity", "sha": "d990ae8b0b55d8b39e807b9cb8108e891ee2e135", "authors": ["Steffen Ducheyne"], "doi": null, "journal": null}
+{"title": "Statistics of polarization-dependent gain in fiber-based Raman amplifiers", "sha": "d9cc640d9b58a0588039a42d6e5dc634bbab4d72", "authors": ["Q Lin", "Govind Agrawal"], "doi": null, "journal": null}
+{"title": "Quantitative 3D Assessment of Myocardial Viability with MRI Delayed Contrast Enhancement", "sha": "d9d9cc9113148ec4f423a29d5f05231ae39f0c7c", "authors": ["V Positano", "M Santarelli", "A Pingitore", "M Lombardi", "L Landini", "A Benassi"], "doi": null, "journal": null}
+{"title": "Encapsulation of Sn@carbon Nanoparticles in Bamboo-like Hollow Carbon Nanofibers as an Anode Material in Lithium-Based Batteries", "sha": "da060cc84f015eda7240c725e34b3c3bc19220b1", "authors": ["Yan Yu", "Lin Gu", "Chunlei Wang", "Abirami Dhanabalan", "Peter Van\u2005aken", "Joachim Maier"], "doi": "10.1002/anie.200901723", "journal": "Angewandte Chemie International Edition"}
+{"title": "Article No. anbe", "sha": "da11b5f0fb60fa0312aa87ff3039659ac57f921c", "authors": ["Sean Walker", "Samuel Marshall", "Ann Rypstra", "Douglas Taylor"], "doi": null, "journal": "ANIMAL BEHAVIOUR"}
+{"title": "Inference in hybrid Bayesian networks", "sha": "da591c9521b29e7dd5106f020ec7f4ca5ed6ffc5", "authors": ["Helge Langseth", "Thomas Nielsen", "Rafael Rum\u00ed", "Antonio Salmer\u00f3n"], "doi": "10.1016/j.ress.2009.02.027", "journal": "Reliability Engineering & System Safety"}
+{"title": "Surface energies of several ceramics with NaCl structure", "sha": "da7b1ce7d8952dad798dba5a97082969dd0eeae4", "authors": ["W Liu", "X Liu", "W Zheng", "Q Jiang"], "doi": "10.1016/j.susc.2005.10.035", "journal": "Surface Science"}
+{"title": "ATOMIC AND PLASMA-MATERIAL INTERACTION DATA FOR FUSION VOLUME 14", "sha": "da803e65874eabd61c1d1e4110972a58382b19ae", "authors": [], "doi": null, "journal": null}
+{"title": "Determination of the radiative decay width of the r/c meson ARGUS Collaboration", "sha": "daa7b1259a2b7697272f4d9c2165b0f49d411ee8", "authors": ["H Albrecht", "T Hamacher", "R Hofmann", "T Kirchhoff", "R Mankel", "A Nau", "S Nowak", "D Rebing", "H Schr6der", "H Schulz", "M Walter", "R Wurth", "C Hast", "H Kapitza", "H Kolanoski", "A Kosche", "A Lange", "A Lindner", "M Schieber", "T Siegmund", "B Spaan", "H Thurn", "D T6pfer", "D Wegener", "Eckstein", "M Schmidtler", "M Schramm", "K Schubert", "R Schwierz", "R Waldi", "K Reim", "H Wegener", "R Eckmann", "H Kuipers", "O Mai", "R Mundt", "T Oest", "R Reiner", "W Schmidt-Parzefall", "Stiewe", "S Wemer", "K Ehret", "W Hofmann", "A Htipper", "K Kn6pfle", "J Spengler", "P Krieger", "D Macfarlane", "J Prentice", "P Saull", "K Tzamariudaki", "R Van De Water", "T.-S Yoon", "C Frankl", "M Schneider", "S Weseler", "G Kernel", "J", "P Kri~anj", "E Kri~ni~j", "T Podobnikj", "T Zivkoj", "V Balagura", "I Belyaev", "S Chechelnitsky", "M Danilov", "A Droutskoy", "Yu Gershtein", "A Golutvin", "I Korolko", "G Kostina", "D Litvintsev", "V Lubimov", "Pakhlov", "S Semenov", "A Snizhko", "I Tichomirov", "Yu Zaitsev", "J Lnstitut", "J Stefan", "Oddelek Za"], "doi": null, "journal": "Physics Letters B"}
+{"title": "Ambient Intelligence-Industrial Research on a Visionary Concept", "sha": "db1c87c022f9abe4f374a8e1e9af951c9b111608", "authors": ["Werner Weber"], "doi": null, "journal": null}
+{"title": null, "sha": "db3273a22cd4a361665a3ccbcde4f0212ad77fce", "authors": [], "doi": null, "journal": null}
+{"title": "Comments on \u201cSolvation Parameters. 2. A Simplified Molecular Topology To Generate Easily Optimized Values\u201d", "sha": "db469456a9b816e49f0d9df1f02f0e72723a53b5", "authors": ["Christina Mintz", "William Acree,", "Michael Abraham"], "doi": "10.1021/ci600254x", "journal": "Journal of Chemical Information and Modeling"}
+{"title": "Field-Responsive Colloidal Suspensions in Microgravity", "sha": "db704f16e53c143f0bfc6042ae017df399899cce", "authors": ["Eric Furst", "Paula Vasquez", "Eric Bennung", "Michael Boyle", "Malvika Ogale", "Juan Agui", "Donna Bohman", "Charles Bunnell", "Peggy Whitson"], "doi": null, "journal": null}
+{"title": "Politics and Power in the European Convention", "sha": "db881b623dfe5aba8d119953f126150c08df607c", "authors": ["Ben Crum"], "doi": null, "journal": null}
+{"title": "Transforming Linear Context-Free Rewriting Systems into Minimalist Grammars \u22c6", "sha": "db93dd5dc26bbe125380f862ad0a268538305c49", "authors": ["Jens Michaelis"], "doi": null, "journal": null}
+{"title": "RAPID COMMUNICATIONS PHYSICAL REVIEW B 15 MAY 1997-II VOLUME", "sha": "dbb9ff618c33e289046030612a0a1a36d7a4b4f7", "authors": ["M Nisoli", "S De Silvestri", "A Cavalleri", "A Malvezzi", "A Stella", "G Lanzani", "P Cheyssac", "R Kofman"], "doi": null, "journal": "NUMBER"}
+{"title": "Appendices A-D A Scalable Method for Deductive Generalization in the Spreadsheet Paradigm APPENDIX B. COMPUTATIONAL EQUIVALENCE BETWEEN CONCRETE AND GENERALIZED FORMULAS", "sha": "dcfc20eeef5430d890d076672d2d4aa477918687", "authors": ["Margaret Burnett"], "doi": null, "journal": "ACM Transactions on Computer-Human Interaction"}
+{"title": "MARKET FRICTIONS AS BUILDING BLOCKS OF AN ORGANIZATIONAL ECONOMICS APPROACH TO STRATEGIC MANAGEMENT", "sha": "dd069e431c76a009ebbcdc6a089c629409d3a9e5", "authors": ["Joseph Mahoney", "Lihong Qian"], "doi": null, "journal": "Strategic Management Journal Strat. Mgmt. J"}
+{"title": "Silencing MicroRNA-155 Ameliorates Experimental Autoimmune Encephalomyelitis", "sha": "dd390b175a80970781447a5907aea75a4ec1d47b", "authors": ["G Murugaiyan", "V Beynon", "A Mittal", "N Joller", "H Weiner"], "doi": "10.4049/jimmunol.1003952", "journal": "The Journal of Immunology"}
+{"title": "Conceptual measurement framework for help-seeking for mental health problems", "sha": "dd53bba734be518f478576598a8c25068b5152c1", "authors": ["Debra Rickwood", "Thomas"], "doi": "10.2147/prbm.s38707", "journal": "Psychology Research and Behavior Management"}
+{"title": "Global Optimization and Constraint Satisfaction: The Branch-and-Reduce Approach", "sha": "dd5ba4cb46a5457c8065afb9fa096bde00406bb3", "authors": ["Nikolaos Sahinidis"], "doi": null, "journal": null}
+{"title": "Patients with Myocardial Infarction and 678 Myocardial Infarction and Normal Coronary Arteriogram (Legrand et a!) Normal Coronary Arteriogram*", "sha": "dd9f62c56cec719e8c45968f59f75ad8390e3e99", "authors": ["V Legrand", "M Deliege", "L Henrard", "H Kulbertus"], "doi": null, "journal": null}
+{"title": "Achievable Performance of Dynamic Channel Assignment Schemes under Varying Reuse Constraints", "sha": "ddb810d68888ac681ea10ad004dc0c0b551d6ac9", "authors": ["Sem Borst", "Phil Whiting"], "doi": null, "journal": "IEEE TRANSACTIONS ON VEHICULAR TECHNOLOGY"}
+{"title": "A Demonstration of Dialogue Processing in SimSensei Kiosk", "sha": "ddd3f37f54d6fd70ef36f436628214d984362add", "authors": ["Fabrizio Morbini", "David Devault", "Kallirroi Georgila", "Ron Artstein", "David Traum", "Louis-Philippe Morency"], "doi": null, "journal": "Proceedings of the SIGDIAL 2014 Conference"}
+{"title": "CONTINUOUS MAPPINGS FROM CANTOR SPACES ONTO INVERSE LIMIT SPECTRA", "sha": "de1abc0d203586a996e3047a6c2345733b5dabe5", "authors": ["Alan Schoenfeld"], "doi": null, "journal": "PROCEEDINGS OF THE AMERICAN MATHEMATICAL SOCIETY"}
+{"title": "Stochastic multiresonance in a chaotic map with fractal basins of attraction", "sha": "de2f2837ad3197807270c58e1617b107c4db4168", "authors": ["S Matyja\u015bkiewicz", "A Krawiecki", "J Ho\u0142yst", "K Kacperski", "W Ebeling"], "doi": "10.1103/physreve.63.026215", "journal": "Physical Review E"}
+{"title": null, "sha": "de7ab1c82e7a74993dd76e7c3a634133d5a29d5a", "authors": [], "doi": null, "journal": null}
+{"title": "DNA Commission of the International Society for Forensic Genetics (ISFG): Recommendations regarding the role of forensic genetics for disaster victim identification (DVI)", "sha": "deb292212a04799c016cf189cf6a63e9327488d8", "authors": ["M Prinz", "A Carracedo", "W Mayr", "N Morling", "T Parsons", "A Sajantila", "R Scheithauer", "H Schmitter", "P Schneider"], "doi": "10.1016/j.fsigen.2006.10.003", "journal": "Forensic Science International: Genetics"}
+{"title": "BAROCKE THEMATIK", "sha": "debe170a4804971677c3c0cf5e6009eb59cbafdb", "authors": [], "doi": null, "journal": null}
+{"title": "Critical state plasticity. Part VI: Meso-scale finite element simulation of strain localization in discrete granular materials", "sha": "df0fb045f0f1cdb503ca9b945a3d1b0a2fb6f36e", "authors": ["Ronaldo Borja", "Jos\u00e9 Andrade"], "doi": "10.1016/j.cma.2005.08.020", "journal": "Computer Methods in Applied Mechanics and Engineering"}
+{"title": "How inhibiting nitrification affects nitrogen cycle and reduces environmental impacts of anthropogenic nitrogen input", "sha": "df571d45f4bf2c1ac9d557e27bcdc454d53eff0d", "authors": ["Chunlian Qiao", "Lingli Liu", "Shuijin Hu", "Jana Compton", "Tara Greaver", "Quanlin Li"], "doi": "10.1111/gcb.12802", "journal": "Global Change Biology"}
+{"title": "Partially Overlapped Channels Not Considered Harmful", "sha": "df5afbdb6bd25df496a429b1409283531701849d", "authors": ["Arunesh Mishra", "Vivek Shrivastava", "Suman Banerjee", "William Arbaugh"], "doi": null, "journal": null}
+{"title": "Comparison of effectiveness of cefovecin, doxycycline, and amoxicillin for the treatment of experimentally induced early Lyme borreliosis in dogs", "sha": "df9ac6070e1c832dc52f872ed9c71c6c7885c812", "authors": ["Bettina Wagner", "John Johnson", "David Garcia-Tapia", "Nicole Honsberger", "Vickie King", "Catherine Strietzel", "John Hardham", "Thomas Heinz", "Richard Marconi", "Patrick Meeus"], "doi": "10.1186/s12917-015-0475-9", "journal": "BMC Veterinary Research"}
+{"title": "Single-chip FPGA Implementation of a Sensorless Speed Control IC for Permanent Magnet Synchronous Motors", "sha": "e067e437b60feeaf4b38339988a911e58f5b5d02", "authors": ["Yen-Chuan Chang", "Ying-Yu Tzou"], "doi": null, "journal": null}
+{"title": "Spatial walking solitons in quadratic nonlinear crystals", "sha": "e0864d996ae0acf38717a96312f8c6b4ad01f7a7", "authors": ["Lluis Torner", "Dumitru Mihalache", "Dumitru Mazilu", "Maria Santos", "Nail Akhmediev"], "doi": null, "journal": null}
+{"title": "A study of temperature effect on chemical, structural and transport parameters determined for two different regenerated cellulose membranes", "sha": "e09d4d3ee6b7cea5fd293c079b05b87a131e6eb6", "authors": ["M V\u00e1zquez", "J Benavente"], "doi": "10.1016/s0376-7388(03)00179-0", "journal": "Journal of Membrane Science"}
+{"title": "Directional Real-Time Optimization Applied to a Kite-Control Simulation Benchmark", "sha": "e0a3575b5e409c9af80b34a70d55c09d91527286", "authors": ["Sean Costello", "Gr\u00e9gory Fran\u00e7ois", "Dominique Bonvin", ")"], "doi": null, "journal": null}
+{"title": "An Experiment in Hierarchical Recognition of Group Activities using Wearable Sensors", "sha": "e0b997484c18db4398ead336087850ab10888a2e", "authors": ["Dawud Gordon", "Jan-Hendrik Hanne", "Martin Berchtold", "Takashi Miyaki", "Michael Beigl"], "doi": null, "journal": null}
+{"title": "How Deep Should It Be? On the Optimality of Hierarchical Architectures", "sha": "e0e8a89f29636d7a8b867d9bc2aea430d6362c11", "authors": ["Amihai Motro", "Alessandro D&apos;atri", "Eli Gafni"], "doi": null, "journal": null}
+{"title": "SCORE TESTS FOR FAMILIAL CORRELATION IN GENOTYPED-PROBAND DESIGNS", "sha": "e11b4fb7ecfa071fc5ca8bc22f34c926d3724ad9", "authors": ["R Carroll", "M Gail", "J Benichou", "C Galindo", "D Pee"], "doi": null, "journal": null}
+{"title": "Wild type human TDP-43 potentiates ALS-linked mutant TDP-43 driven progressive motor and cortical neuron degeneration with pathological features of ALS", "sha": "e129d9b54e99fa656095b587fa17cf677249d6b2", "authors": ["Jacqueline Mitchell", "Remy Constable", "Eva So", "Caroline Vance", "Emma Scotter", "Leanne Glover", "Tibor Hortobagyi", "Eveline Arnold", "Shuo-Chien Ling", "Melissa Mcalonis", "Sandrine Da Cruz", "Magda Polymenidou", "Lino Tessarolo", "Don Cleveland", "Christopher Shaw"], "doi": "10.1186/s40478-015-0212-4", "journal": "Acta Neuropathologica Communications"}
+{"title": "Contributions to Mineralogy and Petrology Formation mechanisms of illite, chlorite and mixed-layer illite-chlorite in Triassic volcanogenic sediments from the Southland Syncline, New Zealand*", "sha": "e1546a48044eef921731ebdedaf5cd7b9650abf0", "authors": ["Jung Ho Abn L&apos;", "Donald Peacor", "Douglas Coombs"], "doi": null, "journal": null}
+{"title": "Interaction of an atom with a small dispersive and absorptive dielectric body", "sha": "e15db31475d4f29a5e4c8543bda0156825289beb", "authors": ["Claudia Eberlein", "Maciej Janowicz"], "doi": "10.1103/physreva.67.063816", "journal": "Physical Review A"}
+{"title": "Competitive Concurrent Distributed Queuing", "sha": "e1f0af54b1ddb555a06f1609ed6bee816e48c247", "authors": ["Maurice Herlihy", "Srikanta Tirthapura", "Roger Wattenhofer"], "doi": null, "journal": null}
+{"title": null, "sha": "e304d01cc502dbcb7b082b24ca0b846a80b4e6bc", "authors": [], "doi": null, "journal": null}
+{"title": "Mesenchymal stem cells mediate the clinical phenotype of inflammatory breast cancer in a preclinical model", "sha": "e33da9ec056ea4a6dd845d25f378ac24241fd062", "authors": ["Lara Lacerda", "Bisrat Debeb", "Daniel Smith", "Richard Larson", "Travis Solley", "Wei Xu", "Savitri Krishnamurthy", "Yun Gong", "Lawrence Levy", "Thomas Buchholz", "Naoto Ueno", "Ann Klopp", "Wendy Woodward"], "doi": "10.1186/s13058-015-0549-4", "journal": "Breast Cancer Research"}
+{"title": "An Interface Design Methodology: Scenario Based Design Extended for Diverse Computer User Groups", "sha": "e346940f6515e60f36fc7d08de757300bbccad61", "authors": ["Kayenda Johnson"], "doi": null, "journal": null}
+{"title": "Greening wireless communications: Status and future directions", "sha": "e3c8a84b1e93bda5585645c2869e388ef27fdc9d", "authors": ["Pablo Serrano", "Antonio De La Oliva", "Paul Patras", "Vincenzo Mancuso", "Albert Banchs"], "doi": "10.1016/j.comcom.2012.06.011", "journal": "Computer Communications"}
+{"title": "Neural Circuit Assembly: Economically Wired by a Single Cadherin", "sha": "e3c8f9e3e55ea877222d479e90b5d746e6b92ba9", "authors": ["Richard Kaschula", "Iris Salecker"], "doi": "10.1016/j.cub.2014.04.038", "journal": "Current Biology"}
+{"title": "Measuring the total economic value of restoring ecosystem services in an impaired river basin: results from a contingent valuation survey", "sha": "e4186b409dc60dd153ca871250aa5abff43a20f1", "authors": ["John Loomis", "Paula Kent", "Liz Strange", "Kurt Fausch", "Alan Covich"], "doi": null, "journal": "Ecological Economics"}
+{"title": "P O B o x 1 1 7 2 2 1 0 0 L u n d + 4 6 4 6-2 2 2 0 0 0 0 Windows In: Empirical Evidence of Construals of Spatial Meaning The Construal of Spatial Meaning Windows into Conceptual Space", "sha": "e4321dd21592b013653fa68d1f0fbec95e1c77be", "authors": ["Carita Paradis", "Jean Hudson", "Magnusson"], "doi": null, "journal": null}
+{"title": "Mechanics of the human red blood cell deformed by optical tweezers", "sha": "e48169ad11d9224530158b5acf81297f50f43ed5", "authors": ["M Dao", "C Lim", "S Suresh"], "doi": "10.1016/j.jmps.2003.09.019", "journal": "Journal of the Mechanics and Physics of Solids"}
+{"title": "Infrastructure Aid, Deindustrialization, and Welfare", "sha": "e4cbf84f9908178e597455e3202f9ffe7c298ca0", "authors": ["E Kwan Choi"], "doi": null, "journal": null}
+{"title": "Effect of traumatic acid on antioxidant activity in Chlorella vulgaris (Chlorophyceae)", "sha": "e546d1c06c234baacc1fb0b1ff3b0659805e1c84", "authors": ["Anna Pietryczuk", "Romuald Czerpak"], "doi": "10.1007/s10725-011-9599-5", "journal": "Plant Growth Regulation"}
+{"title": "Micromagnetic simulation of the pinning and depinning process in permanent magnets", "sha": "e566c063e98077f36efc2e0be7c6c3f9b57e87d0", "authors": ["W Scholz", "T Schrefl", "J Fidler", "T Matthias", "D Suess", "V Tsiantos"], "doi": "10.1109/tmag.2003.815747", "journal": "IEEE Transactions on Magnetics"}
+{"title": "Evaluation of case management of uncomplicated malaria in Haiti: a national health facility survey, 2012", "sha": "e5f60ba6da2c039894bb44fe407f0a95c7ca02d3", "authors": ["Keren Landman", "Samuel Jean", "Alexandre Existe", "Eniko Akom", "Michelle Chang", "Jean Lemoine", "Kimberly Mace"], "doi": "10.1186/s12936-015-0901-2", "journal": "Malaria Journal"}
+{"title": "Complex Source Radiation in a Cylindrical Radome of Metal-Dielectric Grating", "sha": "e628bbec61ab955408c36fd157a6e6b300a8ce0b", "authors": ["Ayhan Alt\u0131nta\u00b8s", "Alt\u0131nta\u00b8s", "Slim Ouardani", "Vladimir Yurchenko"], "doi": null, "journal": "IEEE TRANSACTIONS ON ANTENNAS AND PROPAGATION"}
+{"title": "AA amyloidosis associated with macroglobulinemia", "sha": "e6320352719e48949dd95748fce231f56bab0203", "authors": ["Raine Tatara", "Tadashi Nagai", "Hiroyuki Kobayashi", "Kaoru Hatano", "Takahiro Suzuki", "Kazuo Muroi", "Keiya Ozawa"], "doi": "10.1007/s12185-010-0700-z", "journal": "International Journal of Hematology"}
+{"title": "Masturbation is Related to Psychopathology and Prostate Dysfunction: Comment on Quinsey (2012)", "sha": "e634311c9170674fd734aaf8779153205ab5238f", "authors": ["Rui Miguel Costa"], "doi": "10.1007/s10508-012-9956-0", "journal": "Archives of Sexual Behavior"}
+{"title": "Ontology Assisted Crowd Mining", "sha": "e65d4eac06f9d74154db0713ed48a7f64df89ab4", "authors": ["Yael Amsterdamer", "Susan Davidson", "Tova Milo", "Slava Novgorodov", "Amit Somech"], "doi": null, "journal": null}
+{"title": null, "sha": "e65f8c840bd9d0ca6fe2152bf91e66e63ec58a55", "authors": ["James Johnston", "Eric Ruthruff", "James Johnston", "Eric Ruthruff"], "doi": null, "journal": "Journal of Experimental Psychology: Human Perception and Performance"}
+{"title": "An update of sentinel lymph node mapping in patients with ductal carcinoma in situ", "sha": "e673ccedcc6244856b494445b28326f88dcaa75c", "authors": ["Caren Wilkie", "Laura White", "Elisabeth Dupont", "Alan Cantor", "Charles Cox"], "doi": "10.1016/j.amjsurg.2005.06.011", "journal": "The American Journal of Surgery"}
+{"title": "Structure and Dynamics of Annexin 12 Bound to a Planar Lipid Bilayer", "sha": "e6a0c4b05013d0be043f655ecaf0bd4ceefb6067", "authors": ["T Risse", "W Hubbell", "J Isas", "H Haigler"], "doi": "10.1103/physrevlett.91.188101", "journal": "Physical Review Letters"}
+{"title": "THE INFRARED SPECTRAL ENERGY DISTRIBUTION OF NORMAL STAR-FORMING GALAXIES: CALIBRATION AT FAR-INFRARED AND SUBMILLIMETER WAVELENGTHS", "sha": "e6f0b10cdfa9602d658f32f1d7542206508b8cc3", "authors": ["Daniel Dale", "George Helou"], "doi": null, "journal": null}
+{"title": "A CASE OF HEPATIC ANGIOSARCOMA SUPPLIED BY BOTH HEPATIC ARTERY AND PORTAL VEIN KASUKA W A 4) and YUKIO SA T05)", "sha": "e70a1423e9896c9c290d76334b0f24bf8b283cf5", "authors": [], "doi": null, "journal": null}
+{"title": "Protein domain decomposition using a graph-theoretic approach", "sha": "e720af0627bf52b8a46b94929644d7f296796e46", "authors": ["Ying Xu", "Dong Xu", "Harold Gabow"], "doi": null, "journal": "BIOINFORMATICS"}
+{"title": "Towards Analog and Mixed-Signal SOC Design with SystemC-AMS", "sha": "e80282c38bb0cf8665ef0d1819921de6ae0a1a33", "authors": ["Alain Vachoux", "Christoph Grimm", "Karsten Einwich"], "doi": null, "journal": null}
+{"title": "Perfomance Models for Blocked Sparse Matrix-Vector Multiplication kernels", "sha": "e80cd796e3e26296ab081e15a4a83cd81b952917", "authors": ["Vasileios Karakasis", "Georgios Goumas", "Nectarios Koziris"], "doi": null, "journal": null}
+{"title": "Electrifying Roller-Coaster Ride through Speed Breakers", "sha": "e865f4f65c20b044e79fcb08c86a124f4c4dc127", "authors": ["Amod Kumar", "Pandey Srmswcet", "Somya Yadav", "Tanu Srivastava"], "doi": null, "journal": "International Journal of Computer Applications"}
+{"title": "Mechanism of ubiquitylation by dimeric RING ligase RNF4", "sha": "e87aecd7bdfdac5968dd6b981a49917146ca516f", "authors": ["Anna Plechanovov\u00e1", "Ellis Jaffray", "Stephen Mcmahon", "Kenneth Johnson", "Iva Navr\u00e1tilov\u00e1", "James Naismith", "Ronald Hay"], "doi": "10.1038/nsmb.2108", "journal": "Nature Structural & Molecular Biology"}
+{"title": "CONSTRAINTS TO PEER SCAFFOLDING LIMITA\u00c7\u00d5ES NA COLABORA\u00c7\u00c3O ENTRE PARES", "sha": "e921b9d8cf1f8d3f5e72b2af67e1bad4fc1cef4e", "authors": ["Trab", "Ling", "Aplic", "Campinas"], "doi": null, "journal": "Trab. Ling. Aplic"}
+{"title": "Tungsten disulphide coated multi-walled carbon nanotubes", "sha": "e980445cfb1a113095a4a04b0cbb2254a8f2aa68", "authors": ["R Whitby", "W Hsu", "C Boothroyd", "H Kroto", "D Walton"], "doi": null, "journal": null}
+{"title": "Optimal design of commercial vehicle systems using analytical target cascading", "sha": "e98fc3c58b67c3709b442d9c77dbb2305ad32106", "authors": ["Namwoo Kang", "Michael Kokkolaras", "Panos Papalambros", "Seungwon Yoo", "Wookjin Na", "Jongchan Park", "Dieter Featherman"], "doi": "10.1007/s00158-014-1097-8", "journal": "Structural and Multidisciplinary Optimization"}
+{"title": "Effect of single alien chromosome from shallot ( Allium cepa L. Aggregatum group) on carbohydrate production in leaf blade of bunching onion ( A. fistulosum L.)", "sha": "e9a1de7171a538437bc06a2c6c67f2eb1c1ff84d", "authors": ["Tran Thi", "Minh Hang", "Masayoshi Shigyo", "Shigenori Yaguchi", "Naoki Yamauchi", "Yosuke Tashiro"], "doi": null, "journal": "Genes Genet. Syst"}
+{"title": "Maternal Care and DNA Methylation of a Glutamic Acid Decarboxylase 1 Promoter in Rat Hippocampus", "sha": "e9e863fdc9d6a3d4563649bdd782e7c25da12337", "authors": ["T-Y Zhang", "I Hellstrom", "R Bagot", "X Wen", "J Diorio", "M Meaney"], "doi": "10.1523/jneurosci.1039-10.2010", "journal": "Journal of Neuroscience"}
+{"title": "Clinical utility of CT in children with persistent focal chest abnormality", "sha": "ea6d6d3f190995c2d9813a4cfb47becfc9f5de41", "authors": ["S Montella"], "doi": "10.1183/09031936.05.00082205", "journal": "European Respiratory Journal"}
+{"title": "An Energy-Efficient Differential Flip-Flop for Deeply Pipelined Systems", "sha": "ea772b62de1d1e01cbf8778b2be57482d27fe591", "authors": ["Mitchell Myjak", "Jos\u00e9 Delgado-Frias", "Seon Kwang", "Jeon"], "doi": null, "journal": null}
+{"title": "Is the WHO 90:10 Prostate-Specific Antigen (PSA) First International Reference Standard Really 90% 1-Antichymotrypsin-Bound PSA and 10% Free PSA?", "sha": "eae9e94e9b77cedb8b592fc878477ba5de2c8fdf", "authors": ["L Sokoll", "S Rosenwald", "J Lyons", "D Elliott", "D Chan"], "doi": "10.1373/clinchem.2011.172726", "journal": "Clinical Chemistry"}
+{"title": "Intraoperative electrochemotherapy of colorectal liver metastases", "sha": "eb12560dd540655789ba3b4c0b5750b77b5916b0", "authors": ["Ibrahim Edhemovic", "Erik Brecelj", "Gorana Gasljevic", "Maja Marolt Music", "Vojka Gorjup", "Barbara Mali", "Tomaz Jarm", "Bor Kos", "Denis Pavliha", "Biljana Grcar Kuzmanov", "Maja Cemazar", "Marko Snoj", "Damijan Miklavcic", "Eldar Gadzijev", "Gregor Sersa"], "doi": "10.1002/jso.23625", "journal": "Journal of Surgical Oncology"}
+{"title": "Metabolic scaling theory in plant biology and the three oxygen paradoxa of aerobic life", "sha": "eb4346d501a25bbeda2ed17a96fd21b47dc253b0", "authors": ["Ulrich Kutschera", "Karl Niklas"], "doi": "10.1007/s12064-013-0194-3", "journal": "Theory in Biosciences"}
+{"title": "Computing Bounded Reach Sets from Sampled Simulation Traces *", "sha": "eb6a8eb2f133cd5e73dd7dcefe1cd2e275d1b9b6", "authors": ["Zhenqi Huang", "Sayan Mitra"], "doi": null, "journal": null}
+{"title": "Central Bank Independence and Transparency: Evolution and Effectiveness", "sha": "eb7962e0daf2935435dc19298f8dd0225dada6f1", "authors": ["Christopher Crowe", "Ellen Meade"], "doi": null, "journal": null}
+{"title": "The new animal phylogeny: Reliability and implications", "sha": "eb9b9b19d7dfa85250362558ffc228f45da20ff6", "authors": ["Andr\u00e9 Adoutte", "Guillaume Balavoine", "Nicolas Lartillot", "Olivier Lespinet", "Benjamin Prud&apos;homme", "Renaud De Rosa"], "doi": null, "journal": null}
+{"title": "Effects of far-infrared sauna bathing on recovery from strength and endurance training sessions in men", "sha": "ebfa5e1a70dcbd5138642516987a8122eafdc966", "authors": ["Antti Mero", "Jaakko Tornberg", "Mari M\u00e4ntykoski", "Risto Puurtinen"], "doi": "10.1186/s40064-015-1093-5", "journal": "SpringerPlus"}
+{"title": "Symbolic path-based protocol verification", "sha": "ec5dde0cae8987b9fdd0ce5277d43c2875afc0e1", "authors": ["Wen-Chien Liu", "Chyan-Goei Chung"], "doi": null, "journal": null}
+{"title": "Deriving Information about Architecture from Activity Patterns in Coupled Cell Systems", "sha": "eca4739f083eb4efcbd90295d752532a16b92346", "authors": ["Kresimir Josic", "Jonathan Rubin"], "doi": "10.1137/040607587", "journal": "SIAM Journal on Applied Dynamical Systems"}
+{"title": "Tabu \u00b7 Local Search Mechanism for Mega Process Genetic Algorithm", "sha": "ed310b267053e3007566ebea5417c4c6c3c40e2e", "authors": ["Yoshiko Hanada", "Tomoyuki Hiroyasu", "Mitsunori Miki"], "doi": null, "journal": null}
+{"title": "Dielectric Study of Allyl Chloride with 2-Pentanone and 2-Hexanone in Microwave Frequency Range", "sha": "ed5dd319d8aba8b24bd58fe1ea9768c60d6cf7b9", "authors": ["Yuvraj Sudake", "Siddharth Kamble", "Aruna Maharolkar", "Sunil Patil", "Prakash Khirade", "Suresh Mehrotra"], "doi": "10.5012/bkcs.2012.33.10.3423", "journal": "Bulletin of the Korean Chemical Society"}
+{"title": "Bank lending policy, credit scoring and value-at-risk", "sha": "ed5fde3673658f99edf896c2389215fb2dfa7db8", "authors": ["Tor Jacobson", "Kasper Roszbach"], "doi": "10.1016/s0378-4266(01)00254-0", "journal": "Journal of Banking & Finance"}
+{"title": "PROPIEDADES TELEOL\u00d3GICAS Y SUPERVENIENCIA'*", "sha": "ed61f953bdc66b398476ffa5b5538ba3a08c7629", "authors": ["Manuel Otero"], "doi": null, "journal": null}
+{"title": "Yellow leaf of sugarcane is caused by at least three different genotypes of sugarcane yellow leaf virus, one of which predominates on the Island of R\u00e9union", "sha": "ed696f4a0899e67d343db0626fb53a3dbab29336", "authors": ["Y Abu Ahmad", "L Rassaby", "M Royer", "Z Borg", "K Braithwaite", "T Mirkov", "M Irey", "X Perrier", "G Smith", "P Rott"], "doi": "10.1007/s00705-005-0712-9", "journal": "Archives of Virology"}
+{"title": "The solid-liquid interfacial free energy out of equilibrium Supplementary information", "sha": "edcc8d694cfb89e3ed8b7598fa7401a175d162b8", "authors": ["Bingqing Cheng", "Gareth Tribello", "Michele Ceriotti"], "doi": null, "journal": null}
+{"title": "PRESERVATION OF THE OLFACTORY TRACT IN BIFRONTAL CRANIOTOMY", "sha": "ede6e6f881251fabcd62fb2b42dff8c539812a5f", "authors": ["Paulo Aguiar", "Guilherme Pulici", "Leonardo Lourenco", "Juan Flores", "Valter Cescato", "Henrique Paulo", "Maestro Aguiar -Rua", "Torquato"], "doi": null, "journal": "Arq Neuropsiquiatr"}
+{"title": "SALAZAR'S INTERFERENCE IN THE BBC PORTUGUESE SERVICE DURING WORLD WAR II", "sha": "edfcae4246fe5488d13226623a4afc7d138d2ead", "authors": ["Nelson Ribeiro"], "doi": null, "journal": null}
+{"title": "Les \u00ab tumeurs superficielles de vessie \u00bb n\u2019existent plus", "sha": "ee01687eda02a6ead84d915ef63a78cb90bf3e2c", "authors": ["J Irani", "S Bernardini", "J-L Davin", "L Guy", "C Mazerolles", "Christian Pfister", "M Roupret", "C Roy", "F Rozet", "F Saint", "C Th\u00e9odore", "H Wallerand"], "doi": "10.1016/j.purol.2008.03.002", "journal": "Progr\u00e8s en Urologie"}
+{"title": "What can be learnt from analysing insect orientation flights using probabilistic SLAM?", "sha": "ee8486e840f16483738ec1910939fa62ec69e051", "authors": ["Bartholomew Baddeley", "Andrew Philippides", "Paul Graham", "Natalie De Ibarra", "Thomas Collett", "Phillip Husbands"], "doi": "10.1007/s00422-009-0327-4", "journal": "Biological Cybernetics"}
+{"title": "PPP \u2013 policies, practices and problems in Ghana's urban water supply", "sha": "eea6d1a429b2953ebe528e1bae93a9580e081bcf", "authors": ["Veronika Fuest", "Stefan Haffner"], "doi": "10.2166/wp.2007.060", "journal": "Water Policy"}
+{"title": null, "sha": "eeb8e6e99a475e69c1c510a202174f1f64631b56", "authors": [], "doi": null, "journal": null}
+{"title": "Finite Element Analysis of the Modified Ring Test for Determining Mode I Fracture Toughness", "sha": "eee3e6ea5bb67273912af58887227618fc30e3a9", "authors": ["M Fischer?", "~ Elsworth", "R Alley~", "T Engelder"], "doi": null, "journal": "Int. J. Rock Mech. Min. Sci. & Geomech. Abstr"}
+{"title": "Estudo da Microestrutura Formada no Processo de Soldagem por Atrito em A\u00e7o C-Mn com Pino Consum\u00edvel Microstructural evaluation of a C-Mn steel welded by the Friction Hidro-Pillar Process", "sha": "ef2dba85f589f11b67263097e66e7191ca2cdea0", "authors": ["Insp Soldag", "S\u00e3o Paulo"], "doi": null, "journal": null}
+{"title": "Alerting and orienting of attention without visual awareness", "sha": "ef459dbe36d5e1ea4d208003b4723407f1e9e92c", "authors": ["Shena Lu", "Yongchun Cai", "Mowei Shen", "Ying Zhou", "Shihui Han"], "doi": "10.1016/j.concog.2012.03.012", "journal": "Consciousness and Cognition"}
+{"title": "FORTRAN PROGRAMS FOR CALCULATING CONNECTIVITY OF THREE-DIMENSIONAL NUMERICAL MODELS AND FOR RANKING MULTIPLE REALIZATIONS", "sha": "efbd0d24d959f8dfa8e11fb71058b57818c229ab", "authors": ["Clayton Deutsch"], "doi": null, "journal": null}
+{"title": "Genetic variation links creativity to psychiatric disorders", "sha": "f027644dccfc96081fe34e1ef0f183985131cf4b", "authors": ["Matthew Keller", "Peter Visscher"], "doi": null, "journal": null}
+{"title": null, "sha": "f03610385d240c8a8c46b9e81ceed17d8bb82277", "authors": [], "doi": null, "journal": null}
+{"title": "Christian Richter", "sha": "f047c5a1e547e517d9ea4088d2bdb730087dd1d6", "authors": [], "doi": null, "journal": null}
+{"title": "Evaluation of Interpretation Strategies and Significant Bronchodilator Response in Pediatric Patients With Normal Baseline Spirometry", "sha": "f063aab132accd9a22d75fc46e6f03b85e834c9a", "authors": ["Daniel Hsu", "Thad Ocampo", "Heather Digiovanni", "Eddie Gil"], "doi": "10.4187/respcare.01709", "journal": "Respiratory Care"}
+{"title": "Phase formation and characterization of BaBi2Ta2O9 obtained by mixed oxide procedure", "sha": "f08e450c8d3d432348ec3de3dde626611c7576f7", "authors": ["G Da Costa", "A Sim\u00f5es", "A Ries", "C Foschini", "M Zaghete", "J Varela"], "doi": "10.1016/j.matlet.2003.11.019", "journal": "Materials Letters"}
+{"title": "ECOSYSTEM EFFECTS OF BIODIVERSITY MANIPULATIONS IN EUROPEAN GRASSLANDS", "sha": "f0edf27a49ed5806555d4c9b7284f4d69bdaa5c0", "authors": ["E Spehn", "A Hector", "J Joshi", "M Scherer-Lorenzen", "B Schmid", "E Bazeley-White", "C Beierkuhnlein", "M Caldeira", "M Diemer", "P Dimitrakopoulos", "J Finn", "H Freitas", "P Giller", "J Good", "R Harris", "P Ho\u00a8gberg", "Ho\u00a8 Ho\u00a8gberg", "K Huss-Danell", "A Jumpponen", "J Koricheva", "P Leadley", "M Loreau", "A Minns", "C Mulder", "G O&apos;donovan", "S Otway", "C Palmborg", "J Pereira", "A Pfisterer", "A Prinz", "D Read", "E.-D Schulze", "A.-S Siamantziouras", "A Terry", "A Troumbis", "F Woodward", "S Yachi", "J Lawton"], "doi": null, "journal": "Ecological Monographs"}
+{"title": "Application-Specific Customization and Scalability of Soft Multiprocessors", "sha": "f11a221bbf3b4858fd08725827ad89b3ab65517a", "authors": ["Deepak Unnikrishnan", "Jia Zhao", "Russell Tessier"], "doi": null, "journal": null}
+{"title": "Application and comparison of wind speed sampling methods for wind generation in reliability studies using non-sequential Monte Carlo simulations", "sha": "f11f44afe96169cf27cbcda300baaa18a52f47fe", "authors": ["F Vall\u00e9e", "J Lobry", "O Deblecker"], "doi": null, "journal": null}
+{"title": "Herbivory and climatic warming: a Mediterranean outbreaking caterpillar attacks a relict, boreal pine species\u00b4\u00b4 species\u00b4 species\u00b4\u00b4", "sha": "f167330dfb41f72fb7b3220703cf28e361629473", "authors": ["Jose Hodar", "Regino Zamora\u00b4\u00b4", "Zamora\u00b4 Zamora\u00b4\u00b4"], "doi": null, "journal": null}
+{"title": "Approximate Completed Trace Equivalence of Three Dimensional t-Model Nonlinear Algebraic Hybrid Systems", "sha": "f1755779197ed17483f776a3a9627638192d0123", "authors": ["Hao Yang", "Jinzhao Wu", "Zhiwei Zhang", "Yang Liu"], "doi": "10.12785/amis/070506", "journal": "Applied Mathematics & Information Sciences"}
+{"title": "Graphene oxide as a chemically tunable platform for optical applications", "sha": "f1d434a128c32e5eb27033f699b339423f24786c", "authors": ["Kian Loh", "Qiaoliang Bao", "Goki Eda", "Manish Chhowalla"], "doi": "10.1038/nchem.907", "journal": "Nature Chemistry"}
+{"title": "IMPLEMENTATION OF FINITE DIFFERENCE SCHEMES FOR THE WAVE EQUATION ON FPGA", "sha": "f24ce5f580ee287e8a2b69f7a1f9899b052a5f9b", "authors": ["E Motuk", "R Woods", "S Bilbao"], "doi": null, "journal": null}
+{"title": "Spherical Bullet Formation via E-cadherin Promotes Therapeutic Potency of Mesenchymal Stem Cells Derived From Human Umbilical Cord Blood for Myocardial Infarction", "sha": "f25ede5132fd7a400ba963d5dfc2b61ebe637490", "authors": ["Eun Lee", "Sung Park", "Soo Kang", "Gi-Hwan Kim", "Hyun-Jae Kang", "Sae-Won Lee", "Hong Jeon", "Hyo-Soo Kim"], "doi": "10.1038/mt.2012.58", "journal": "Molecular Therapy"}
+{"title": "Ecology and conservation of common bottlenose dolphinsTursiops truncatusin the Mediterranean Sea", "sha": "f2bd6d365ad7a1cf5b1b4f93ecd29517ff684fb2", "authors": ["Giovanni Bearzi", "Caterina Fortuna", "Randall Reeves"], "doi": "10.1111/j.1365-2907.2008.00133.x", "journal": "Mammal Review"}
+{"title": "Neutrophil chemokines in bronchoalveolar lavage fluid and leukocyte-conditioned medium from nonsmokers and smokers", "sha": "f3c8fda8d2a21145380f37555b5f91c425aff08a", "authors": ["D Morrison", "R Strieter", "S Donnelly", "M Burdick", "S Kunkel", "W Macnee"], "doi": "10.1183/09031936.98.12051067", "journal": "European Respiratory Journal"}
+{"title": "Preoperative hyperfractionated accelerated radiotherapy (HART) and concomitant CPT-11 in locally advanced rectal carcinoma: A Phase I study", "sha": "f45150aee38cf4f897370abe5268b4f3ec659f78", "authors": ["Verena Voelter", "Roger Stupp", "Maurice Matter", "Michel Gillet", "Hanifa Bouzourene", "Serge Leyvraz", "Philippe Coucke"], "doi": "10.1016/s0360-3016(03)00326-2", "journal": "International Journal of Radiation Oncology*Biology*Physics"}
+{"title": "Autonomous growth of BALB/MK keratinocytes transfected with a retroviral vector carrying the human epidermal growth factor gene", "sha": "f4aaabcc5cb42d039230729f9df21cd975c277a9", "authors": ["Jomuna Choudhuri", "Monica Mathor", "Fl\u00e1via Silva", "Sang Han"], "doi": null, "journal": null}
+{"title": "An Algorithm for Maximizing Expected Log Investment Return 369", "sha": "f51017714534dc649c8b53c90c80aef4a3481d32", "authors": ["Thomas Cover"], "doi": null, "journal": "IEEE TRANSACTIONS ON INFORMATION THEORY"}
+{"title": "Molecular definition of heterogeneous nuclear ribonucleoprotein R (hnRNP R) using autoimmune antibody: immunological relationship with hnRNP P", "sha": "f52e97904ccc2725382f93d001631857d2142b20", "authors": ["Wolfgang Hassfeld", "Edward Chan", "David Mathison", "Douglas Portman", "Gideon Dreyfuss", "G\u00fcnter Steiner", "Eng Tan", "W Keck"], "doi": null, "journal": "Nucleic Acids Research"}
+{"title": "Little Emperors in the UK: acculturation and food over time", "sha": "f551fd60f6e345da3b03dbc089ed8ae2358f0c81", "authors": ["Benedetta Cappellini", "Dorothy Ai-Wan Yen", "Royal Holloway", "Dorothy Yen"], "doi": null, "journal": "Journal of Business Research"}
+{"title": "Imaging capability of pseudomorphic high electron mobility transistors, AlGaN / GaN, and Si micro-Hall probes for scanning Hall probe microscopy between 25 and 125 \u00b0 C", "sha": "f5751f540db2c99c435b200bca65bd8f167c695c", "authors": ["R Akram", "M Dede", "A Oral"], "doi": null, "journal": null}
+{"title": "An English-Arabic Bi-directional Machine Translation Tool in the Agriculture Domain A Rule-Based Transfer Approach for Translating Expert Systems", "sha": "f5b17dd68c18848901c6a7099a099b68fa6f3014", "authors": ["Khaled Shaalan", "Ashraf Hendam", "Ahmed Rafea"], "doi": null, "journal": "IFIP AICT"}
+{"title": "The Netherlad JOURNAL OF MEDICINE ELSEVIER", "sha": "f5ccedd447840d24a1adbdd93c998f8b8fa99c65", "authors": ["S Schalm"], "doi": null, "journal": "Netherlands Journal of Medicine"}
+{"title": "A Model of Dynamic Auditory Perception and Its Application to Robust Word Recognition", "sha": "f6dfa59215fb07b1646a8799e5183bf36f00cf54", "authors": ["Brian Strope", "Abeer Alwan"], "doi": null, "journal": "IEEE TRANSACTIONS ON SPEECH AND AUDIO PROCESSING"}
+{"title": "Fundamental Tradeoffs in Vehicular Ad Hoc Networks", "sha": "f6e140e4653c7f4867afb266685d552981668fc3", "authors": ["Mohammad Nekoui", "Hossein Pishro-Nik"], "doi": null, "journal": null}
+{"title": "On computing the entropy of cellular automata", "sha": "f7231202fa47261ae331033d74e2120dc4878a44", "authors": ["Michele D&apos;amico", "Giovanni Manzini", "Luciano Margara"], "doi": null, "journal": "Theoretical Computer Science"}
+{"title": "Initial spread of the invasive green alga Caulerpa verticillata over coral reef communities in the Gulf of California", "sha": "f7601e6e94fc4f863fbb61f71877932d12556c0d", "authors": ["C P\u00e9rez-Estrada", "R Rodr\u00edguez-Estrella", "D Palacios-Salgado", "D Paz-Garc\u00eda"], "doi": "10.1007/s00338-013-1045-x", "journal": "Coral Reefs"}
+{"title": "Editorial: Evaluating Knowledge Engineering Techniques", "sha": "f77a6913c513bd27de190b1a309c40349ce06d13", "authors": ["Tim Menzies", "Frank Van Harmelen"], "doi": null, "journal": "International Journal of Human Computer Studies, Special Issue on Evaluation of Knowledge Engineering Techniques"}
+{"title": "Planar graph classes with the independent set problem solvable in polynomial time", "sha": "f789c4ddf84493555f6c95f1842105be104c2d25", "authors": ["V Alekseev", "D Malyshev"], "doi": "10.1134/s1990478909010013", "journal": "Journal of Applied and Industrial Mathematics"}
+{"title": "Comparison of Software Architecture Reverse Engineering Methods", "sha": "f7be66f7da628006696bdc0702d76dd0c8b1ae06", "authors": ["C Stringfellow", "C Amory", "D Potnuri", "A Andrews", "M Georg"], "doi": null, "journal": null}
+{"title": "Corrosion inhibition by aerobic biofilms on SAE 1018 steel", "sha": "f7cd8b336a0d135cc8ec5b577e7aec032b63a1b3", "authors": [], "doi": null, "journal": null}
+{"title": "The use of non-destructive measurement and physiological models of yield determination to investigate factors determining differences in seed yield between genotypes of \"desi\" chickpeas (Cicer arietum)t", "sha": "f824598d76be2a195a29008252d1df198bfb5cdb", "authors": ["J Williams", "N Saxena"], "doi": null, "journal": "Ann. appl. B i d"}
+{"title": "A New DLL-Based Approach for All-Digital Multiphase Clock Generation", "sha": "f85e251669faba289436ed344ebbf897ddf85f3a", "authors": ["C-C Chung", "C-Y Lee"], "doi": "10.1109/jssc.2003.822890", "journal": "IEEE Journal of Solid-State Circuits"}
+{"title": "Self-reported fever and measured temperature in emergency department records used for syndromic surveillance: Table 1", "sha": "f906abbcabba0690e3649209125b05b13a33e6c7", "authors": ["Taha Kass-Hout", "David Buckeridge", "John Brownstein", "Zhiheng Xu", "Paul Mcmurray", "Charles Ishikawa", "Julia Gunn", "Barbara Massoudi"], "doi": "10.1136/amiajnl-2012-000847", "journal": "Journal of the American Medical Informatics Association"}
+{"title": "Female gender increases stiffness of elastic but not of muscular arteries in type I diabetic patients", "sha": "f997fdcd75d9f2bdcc0ab6321d4ab4b0d62dc8d0", "authors": ["A Ahlgren", "G Sundkvist", "T Sandgren", "T Lanne"], "doi": "10.1046/j.1475-097x.2002.00451.x", "journal": "Clinical Physiology and Functional Imaging"}
+{"title": "Measuring growth of a phenanthrene-degrading bacterial inoculum in soil with a quantitative competitive polymerase chain reaction method", "sha": "fa5e28a6d84ba198a4597bd3958ca043d0d236f5", "authors": ["Egbert Schwartz", "Sinh Trinh", "Kate Scow"], "doi": null, "journal": null}
+{"title": "The benefits of body weight loss on health-related quality of life", "sha": "faf33567051201e084ed73147f8b97a86afc13fe", "authors": ["Hsiang-Ju Pan", "Beatriz Cole", "Allan Geliebter"], "doi": "10.1016/j.jcma.2011.01.038", "journal": "Journal of the Chinese Medical Association"}
+{"title": "Cortical activity during tactile exploration of objects in blind and sighted humans", "sha": "fb49efedd1a153afd9d48d0039b98ddbab32f3fa", "authors": ["Amir Amedi", "Noa Raz", "Haim Azulay", "Rafael Malach", "Ehud Zohary"], "doi": "10.3233/RNN-2010-0497", "journal": "Restorative Neurology and Neuroscience"}
+{"title": "Influence of boundary conditions and size effect on the drift capacity of URM walls", "sha": "fbb95388581da620d91cb4ffd27c0ac4c7ca11e6", "authors": ["Sarah Petry", "Katrin Beyer"], "doi": "10.1016/j.engstruct.2014.01.048", "journal": "Engineering Structures"}
+{"title": "Designed amyloid fibers as materials for selective carbon dioxide capture", "sha": "fbc8cbb0b2743374d2966a12836c3d4e6574ae23", "authors": ["D Li", "H Furukawa", "H Deng", "C Liu", "O Yaghi", "D Eisenberg"], "doi": "10.1073/pnas.1321797111", "journal": "Proceedings of the National Academy of Sciences"}
+{"title": "Toward discovery science of human brain function", "sha": "fc3311286b6b171f0eaae9b6c6a70ba5b58d8e36", "authors": ["B Biswal", "M Mennes", "X-N Zuo", "S Gohel", "C Kelly", "S Smith", "C Beckmann", "J Adelstein", "R Buckner", "S Colcombe", "A-M Dogonowski", "M Ernst", "D Fair", "M Hampson", "M Hoptman", "J Hyde", "V Kiviniemi", "R Kotter", "S-J Li", "C-P Lin", "M Lowe", "C Mackay", "D Madden", "K Madsen", "D Margulies", "H Mayberg", "K Mcmahon", "C Monk", "S Mostofsky", "B Nagel", "J Pekar", "S Peltier", "S Petersen", "V Riedl", "S Rombouts", "B Rypma", "B Schlaggar", "S Schmidt", "R Seidler", "G Siegle", "C Sorg", "G-J Teng", "J Veijola", "A Villringer", "M Walter", "L Wang", "X-C Weng", "S Whitfield-Gabrieli", "P Williamson", "C Windischberger", "Y-F Zang", "H-Y Zhang", "F Castellanos", "M Milham"], "doi": "10.1073/pnas.0911855107", "journal": "Proceedings of the National Academy of Sciences"}
+{"title": "Flavonoids and the Risk of Oral and Pharyngeal Cancer: A Case-Control Study from Italy", "sha": "fc42da454ba17cccd362c72d33d70d7271d32e2e", "authors": ["M Rossi", "W Garavello", "R Talamini", "E Negri", "C Bosetti", "L Dal Maso", "P Lagiou", "A Tavani", "J Polesel", "L Barzan", "V Ramazzotti", "S Franceschi", "C La Vecchia"], "doi": "10.1158/1055-9965.epi-07-0168", "journal": "Cancer Epidemiology Biomarkers & Prevention"}
+{"title": "Accuracy of MUAC in the Detection of Severe Wasting With the New WHO Growth Standards", "sha": "fc5b3c87ee44687d4026b86db8f92f219c8163f5", "authors": ["M Fernandez", "P Delchevalerie", "M Van Herp"], "doi": "10.1542/peds.2009-2175", "journal": "PEDIATRICS"}
+{"title": "The Design and Implementation of Hierarchical Software Systems With Reusable Components \u2020", "sha": "fcc157fdb9a734a51056769b29d23e2cd7cffa2a", "authors": ["Don Batory", "Sean O&apos;malley"], "doi": null, "journal": "ACM Transactions on Software Engr. and Methodology"}
+{"title": "Thermal Distributions in Stellar Plasmas, Nuclear Reactions and Solar Neutrinos", "sha": "fcf81d95ffd2815318ee45f51636a1d1587df3be", "authors": ["M Coraddu", "G Kaniadakis", "A", "M Lissia", "G Mezzorani", "P Quarati"], "doi": null, "journal": "Brazilian Journal of Physics"}
+{"title": "Analysis of spatial structure of epidermal nerve entry point patterns based on replicated data", "sha": "fd3e8eb8f3ff38919388336f155a2de4696bbd92", "authors": ["M Myllym\u00e4ki", "I Panoutsopoulou", "A S\u00e4rkk\u00e4"], "doi": "10.1111/j.1365-2818.2012.03636.x", "journal": "Journal of Microscopy"}
+{"title": "A BOUNDARY VALUE PROBLEM FOR HERMITIAN HARMONIC MAPS AND APPLICATIONS", "sha": "fd3feb3b1279ce41a9cae80215739a09a1cc62b1", "authors": ["Jingyi Chen"], "doi": null, "journal": "PROCEEDINGS OF THE AMERICAN MATHEMATICAL SOCIETY"}
+{"title": "Invariance of long-term visual priming to scale, reflection, translation, and hemisphere", "sha": "fd93840a73a800d8f66895caf09af8bfab627c16", "authors": ["J\u00f3 Zsef Fiser", "Irving Biederman"], "doi": null, "journal": "Vision Research"}
+{"title": "Vasopressin versus Norepinephrine Infusion in Patients with Septic Shock", "sha": "fdc44a26fc090c06171d4e3a33fd4a96278c8a89", "authors": ["James Russell", "Keith Walley", "Joel Singer", "Anthony Gordon", "M", "B", "Paul H\u00e9bert", "D Cooper", "B", "Cheryl Holmes", "Sangeeta Mehta", "John Granton", "Michelle Storms", "Deborah Cook", "Jeffrey Presneill", "M", "B", "Dieter Ayers", "J", "K", "A", "St Paul&apos;s Hospital", "J", "K", "J", "A", "D", "; Ottawa", "Hospi-Tal"], "doi": null, "journal": null}
+{"title": "Original articles Relationship between anxiety, depression, and morbidity in adult asthma patients", "sha": "fdd7b113f15601df6d5ee228be864daf494ada6d", "authors": ["L Rimington", "D Davies", "D Lowe", "M Pearson"], "doi": null, "journal": null}
+{"title": "The American dream", "sha": "fe1b96869a0019ae1261fcbdf6a50455c5967581", "authors": ["Gilberto Camanho"], "doi": "10.1016/j.rboe.2015.07.001", "journal": "Revista Brasileira de Ortopedia (English Edition)"}
+{"title": "WIENER INDEX OF TREES OF GIVEN ORDER AND DIAMETER AT MOST", "sha": "fe431f26dd30e821138d9b22bde4ca1b3c76fbd0", "authors": ["Simon Mukwembi", "Tom\u00e1\u0161 Vetr\u00edk"], "doi": "10.1017/s0004972713000816", "journal": "Bulletin of the Australian Mathematical Society"}
+{"title": "An experimental test for gender differences in beneficent behavior", "sha": "fe8694ee57d4c302e471dba32d3659fd0450bdc6", "authors": ["Gary Bolton", "Elena Katok"], "doi": null, "journal": null}
+{"title": "Inhibition of murine nephritogenic effector T cells by a clone-specific suppressor factor.", "sha": "feb3a54218e15a46899e217be5087b34e5b009d6", "authors": ["C Meyers", "C Kelly"], "doi": "10.1172/jci117564", "journal": "Journal of Clinical Investigation"}
+{"title": "Arabic Text Classification Framework Based on Latent Dirichlet Allocation", "sha": "fefb8d754de579a31c780343d9286cb16f317c51", "authors": ["Mounir Zrigui", "Rami Ayadi", "Mourad Mars", "Mohsen Maraoui"], "doi": null, "journal": "Journal of Computing and Information Technology-CIT"}
+{"title": "Meaning, Truth and Phenomenology", "sha": "ff1d6c24f945e6ff8dabf87d34f03a2d9f008bbb", "authors": ["Mark Bevir"], "doi": null, "journal": null}
+{"title": "An improved algorithm for optimal lightpath establishment on a tree topology", "sha": "ff55b60fee69c2dca318934080e4d9446fd3020b", "authors": ["Guoliang Xue", "Weiyi Zhang", "Jian Tang", "K Thulasiraman"], "doi": "10.1109/jsac-ocn.2006.22605", "journal": "IEEE Journal on Selected Areas in Communications"}
+{"title": "Ten years of maturation of endoscopic surgery in children. Is the wine good?", "sha": "ff7eeff4190ccaf23bb3bf83a1d4a3b224d640fd", "authors": ["N Bax"], "doi": "10.1016/j.jpedsurg.2003.10.016", "journal": "Journal of Pediatric Surgery"}
+{"title": "Specific effects of EEG based neurofeedback training on memory functions in post-stroke victims", "sha": "ff911fa4d129aa7b6a8ee3b6e285cc53ada12382", "authors": ["Silvia Kober", "Daniela Schweiger", "Matthias Witte", "Johanna Reichert", "Peter Grieshofer", "Christa Neuper", "Guilherme Wood"], "doi": "10.1186/s12984-015-0105-6", "journal": "Journal of NeuroEngineering and Rehabilitation"}
+{"title": null, "sha": "ffd42084c06bdc643536bb6f53fe53e7b799df9e", "authors": ["Cad", "Rio Sa\u00fade P\u00fablica", "De Janeiro"], "doi": null, "journal": null}
diff --git a/match_test_data/math_universe_releases.json b/match_test_data/math_universe_releases.json
new file mode 100644
index 0000000..a9d1145
--- /dev/null
+++ b/match_test_data/math_universe_releases.json
@@ -0,0 +1,4 @@
+{"abstracts":[],"refs":[],"contribs":[{"index":0,"raw_name":"Jacob Aron","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"Elsevier BV","pages":"6-7","issue":"2882","volume":"215","ext_ids":{"doi":"10.1016/s0262-4079(12)62353-7"},"release_year":2012,"release_stage":"published","release_type":"article-journal","container_id":"6xjeghgubvcdbc3zgvvqqxpor4","work_id":"aaaaaaaaajbsfdz3eay3hckz3m","title":"Fiendish ABC proof heralds new mathematical universe","state":"active","ident":"azp5lcfe35eoboze4syyewq7au","revision":"8da33be8-51f2-4f7b-8f2d-01e69850d9b8","extra":{"crossref":{"alternative-id":["S0262407912623537"],"type":"journal-article"}}}
+{"abstracts":[],"refs":[{"index":0,"extra":{"authors":["P.A.M. Dirac"],"unstructured":"Dirac, P.A.M.: Proc. R. Soc. A 133, 60 (1931)","volume":"133"},"key":"9186_CR1","year":1931,"container_name":"Proc. R. Soc. A","locator":"60"},{"index":1,"extra":{"authors":["E.P. Wigner"],"unstructured":"Wigner, E.P.: Symmetries and Reflections. MIT Press, Cambridge (1967)","volume-title":"Symmetries and Reflections"},"key":"9186_CR2","year":1967,"container_name":"Symmetries and Reflections"},{"index":2,"extra":{"authors":["P. Suppes"],"unstructured":"Suppes, P.: Studies in Methodology and Foundation of Science: Selected Papers from 1951 to 1969. Reidel, Dordrecht (1969)","volume-title":"Studies in Methodology and Foundation of Science: Selected Papers from 1951 to 1969"},"key":"9186_CR3","year":1969,"container_name":"Studies in Methodology and Foundation of Science: Selected Papers from 1951 to 1969"},{"index":3,"extra":{"unstructured":"Zuse, K.: http://www.zib.de/zuse/English_Version/Inhalt/Texte/Chrono/60er/Pdf/76scan.pdf (1976)"},"key":"9186_CR4"},{"index":4,"extra":{"authors":["R. Rucker"],"unstructured":"Rucker, R.: Infinity and the Mind. Birkhäuser, Boston (1982)","volume-title":"Infinity and the Mind"},"key":"9186_CR5","year":1982,"container_name":"Infinity and the Mind"},{"index":5,"extra":{"authors":["J.D. Barrow"],"unstructured":"Barrow, J.D.: Theories of Everything. Ballantine, New York (1991)","volume-title":"Theories of Everything"},"key":"9186_CR6","year":1991,"container_name":"Theories of Everything"},{"index":6,"extra":{"authors":["J.D. Barrow"],"unstructured":"Barrow, J.D.: Pi in the Sky. Clarendon, Oxford (1992)","volume-title":"Pi in the Sky"},"key":"9186_CR7","year":1992,"container_name":"Pi in the Sky"},{"index":7,"extra":{"authors":["P. Davies"],"unstructured":"Davies, P.: The Mind of God. 
Touchstone, New York (1993)","volume-title":"The Mind of God"},"key":"9186_CR8","year":1993,"container_name":"The Mind of God"},{"index":8,"extra":{"unstructured":"Jackiw, R.: hep-th/9410151 (1994)"},"key":"9186_CR9"},{"index":9,"extra":{"authors":["S. Lloyd"],"unstructured":"Lloyd, S.: Complexity 3, 32 (1997). quant-ph/9912088"},"key":"9186_CR10","year":1997,"container_name":"Complexity","locator":"32"},{"index":10,"extra":{"authors":["M. Tegmark"],"unstructured":"Tegmark, M.: Ann. Phys. 270, 1 (1998). gr-qc/9704009","volume":"270"},"key":"9186_CR11","year":1998,"container_name":"Ann. Phys."},{"index":11,"extra":{"authors":["J. Schmidhuber"],"unstructured":"Schmidhuber, J.: In: Freksa, C. (ed.) Lecture Notes in Computer Science, p. 201. Springer, Berlin (1997). quant-ph/9904050","volume-title":"Lecture Notes in Computer Science"},"key":"9186_CR12","year":1997,"container_name":"Lecture Notes in Computer Science","locator":"201"},{"index":12,"extra":{"authors":["J. Ladyman"],"unstructured":"Ladyman, J.: Stud. Hist. Philos. Sci. 29, 409–424 (1998)","volume":"29"},"key":"9186_CR13","year":1998,"container_name":"Stud. Hist. Philos. Sci.","locator":"409"},{"index":13,"extra":{"authors":["M. Tegmark"],"unstructured":"Tegmark, M.: Sci. Am. 270(5), 40 (2003)","volume":"270"},"key":"9186_CR14","year":2003,"container_name":"Sci. Am.","locator":"40"},{"index":14,"extra":{"unstructured":"Tegmark, M.: astro-ph/0302131 (2003)"},"key":"9186_CR15"},{"index":15,"extra":{"unstructured":"Schmidhuber, J.: quant-ph/0011122 (2000)"},"key":"9186_CR16"},{"index":16,"extra":{"authors":["S. Wolfram"],"unstructured":"Wolfram, S.: A New Kind of Science. Wolfram Media, New York (2002)","volume-title":"A New Kind of Science"},"key":"9186_CR17","year":2002,"container_name":"A New Kind of Science"},{"index":17,"extra":{"unstructured":"Cohen, M.: Master's thesis. Dept. of Philosophy, Ben Gurion University of the Negev, Israel (2003)"},"key":"9186_CR18"},{"index":18,"extra":{"authors":["F.J. 
Tipler"],"unstructured":"Tipler, F.J.: Rep. Prog. Phys. 68, 897 (2005)","volume":"68"},"key":"9186_CR19","year":2005,"container_name":"Rep. Prog. Phys.","locator":"897"},{"index":19,"extra":{"unstructured":"McCabe, G.: gr-qc/0610016 (2006)"},"key":"9186_CR20"},{"index":20,"extra":{"unstructured":"McCabe, G.: gr-qc/0601073 (2006)"},"key":"9186_CR21"},{"index":21,"extra":{"authors":["R.K. Standish"],"unstructured":"Standish, R.K.: Theory of Nothing. Booksurge, Charleston (2006)","volume-title":"Theory of Nothing"},"key":"9186_CR22","year":2006,"container_name":"Theory of Nothing"},{"index":22,"extra":{"authors":["F. Wilczek"],"issue":"11","unstructured":"Wilczek, F.: Phys. Today 58(11), 8 (2006)","volume":"58"},"key":"9186_CR23","year":2006,"container_name":"Phys. Today"},{"index":23,"extra":{"authors":["F. Wilczek"],"unstructured":"Wilczek, F.: Phys. Today 60(6), 8 (2007)","volume":"60"},"key":"9186_CR24","year":2007,"container_name":"Phys. Today"},{"index":24,"extra":{"authors":["O.E. Rössler"],"unstructured":"Rössler, O.E.: In: Casti, J.L., Karlquist, A. (eds.) Artificial Minds. North-Holland, New York (1987)","volume-title":"Artificial Minds"},"key":"9186_CR25","year":1987,"container_name":"Artificial Minds"},{"index":25,"extra":{"authors":["K. Svozil"],"unstructured":"Svozil, K.: In: Atmanspacher, H., Dalenoort, G.J. (eds.) Inside Versus Outside. Springer, Berlin (1994)","volume-title":"Inside Versus Outside"},"key":"9186_CR26","year":1994,"container_name":"Inside Versus Outside"},{"index":26,"extra":{"authors":["K. Svozil"],"unstructured":"Svozil, K.: In: Trappl, R. (ed.) Cybernetics and Systems '96. Austrian Society for Cybernetic Studies, Vienna (1996)","volume-title":"Cybernetics and Systems '96"},"key":"9186_CR27","year":1996,"container_name":"Cybernetics and Systems '96"},{"index":27,"extra":{"authors":["H. Everett"],"unstructured":"Everett, H.: Rev. Mod. Phys. 29, 454 (1957)","volume":"29"},"key":"9186_CR28","year":1957,"container_name":"Rev. Mod. 
Phys.","locator":"454"},{"index":28,"extra":{"authors":["N. Everett"],"unstructured":"Everett, N.: In: DeWitt, B.S., Graham, N. (eds.) The Many-Worlds Interpretation of Quantum Mechanics. Princeton University Press, Princeton (1973)","volume-title":"The Many-Worlds Interpretation of Quantum Mechanics"},"key":"9186_CR29","year":1973,"container_name":"The Many-Worlds Interpretation of Quantum Mechanics"},{"index":29,"extra":{"authors":["W. Hodges"],"unstructured":"Hodges, W.: A Shorter Model Theory. Cambridge University Press, Cambridge (1997)","volume-title":"A Shorter Model Theory"},"key":"9186_CR30","year":1997,"container_name":"A Shorter Model Theory"},{"index":30,"extra":{"authors":["H. Weyl"],"unstructured":"Weyl, H.: Space, Time, Matter. Methuen, London (1922)","volume-title":"Space, Time, Matter"},"key":"9186_CR31","year":1922,"container_name":"Space, Time, Matter"},{"index":31,"extra":{"authors":["H.R. Brown"],"unstructured":"Brown, H.R., Brading, K.A.: Dialogos 79, 59 (2002)","volume":"79"},"key":"9186_CR32","year":2002,"container_name":"Dialogos","locator":"59"},{"index":32,"extra":{"authors":["K.A. Brading"],"doi":"10.1017/cbo9780511535369","unstructured":"Brading, K.A., Castellani, E. (eds.): In: Symmetries in Physics: Philosophical Reflections. Cambridge University Press, Cambridge (2003). quant-ph/0301097","volume-title":"Symmetries in Physics: Philosophical Reflections"},"key":"9186_CR33","year":2003,"container_name":"Symmetries in Physics: Philosophical Reflections"},{"index":33,"extra":{"authors":["E. Majorana"],"unstructured":"Majorana, E.: Nuovo Cimento 9, 335 (1932)"},"key":"9186_CR34","year":1932,"container_name":"Nuovo Cimento","locator":"335"},{"index":34,"extra":{"authors":["P.A.M. Dirac"],"unstructured":"Dirac, P.A.M.: Proc. R. Soc. A 155, 447 (1936)","volume":"155"},"key":"9186_CR35","year":1936,"container_name":"Proc. R. Soc. A","locator":"447"},{"index":35,"extra":{"authors":["A. Proca"],"unstructured":"Proca, A.: J. Phys. Rad. 
7, 347 (1936)"},"key":"9186_CR36","year":1936,"container_name":"J. Phys. Rad.","locator":"347"},{"index":36,"extra":{"authors":["E.P. Wigner"],"unstructured":"Wigner, E.P.: Ann. Math. 40, 149 (1939)","volume":"40"},"key":"9186_CR37","year":1939,"container_name":"Ann. Math.","locator":"149"},{"index":37,"extra":{"authors":["R.M. Houtappel"],"unstructured":"Houtappel, R.M., van Dam, H., Wigner, E.P.: Rev. Mod. Phys. 37, 595 (1965)","volume":"37"},"key":"9186_CR38","year":1965,"container_name":"Rev. Mod. Phys.","locator":"595"},{"index":38,"extra":{"unstructured":"Deutsch, D.: quant-ph/9906015 (1999)"},"key":"9186_CR39"},{"index":39,"extra":{"unstructured":"Weinberg, S.: hep-th/9702027 (1997)"},"key":"9186_CR40"},{"index":40,"extra":{"authors":["M.J. Rees"],"unstructured":"Rees, M.J.: Our Cosmic Habitat. Princeton University Press, Princeton (2002)","volume-title":"Our Cosmic Habitat"},"key":"9186_CR41","year":2002,"container_name":"Our Cosmic Habitat"},{"index":41,"extra":{"authors":["M. Tegmark"],"unstructured":"Tegmark, M., Aguirre, A., Rees, M.J., Wilczek, F.: Phys. Rev. D 73, 023505 (2006)","volume":"73"},"key":"9186_CR42","year":2006,"container_name":"Phys. Rev. D","locator":"023505"},{"index":42,"extra":{"authors":["G.J. Chaitin"],"doi":"10.1017/cbo9780511608858","unstructured":"Chaitin, G.J.: Algorithmic Information Theory. Cambridge University Press, Cambridge (1987)","volume-title":"Algorithmic Information Theory"},"key":"9186_CR43","year":1987,"container_name":"Algorithmic Information Theory"},{"index":43,"extra":{"authors":["M. Li"],"doi":"10.1007/978-1-4757-2606-0","unstructured":"Li, M., Vitanyi, P.: An Introduction to Kolmogorov Complexity and Its Applications. Springer, Berlin (1997)","volume-title":"An Introduction to Kolmogorov Complexity and Its Applications"},"key":"9186_CR44","year":1997,"container_name":"An Introduction to Kolmogorov Complexity and Its Applications"},{"index":44,"extra":{"authors":["E. Borel"],"unstructured":"Borel, E.: Rend. 
Circ. Mat. Palermo 26, 247 (1909)","volume":"26"},"key":"9186_CR45","year":1909,"container_name":"Rend. Circ. Mat. Palermo","locator":"247"},{"index":45,"extra":{"authors":["K.L. Chung"],"unstructured":"Chung, K.L.: A Course in Probability Theory. Academic, New York (1974)","volume-title":"A Course in Probability Theory"},"key":"9186_CR46","year":1974,"container_name":"A Course in Probability Theory"},{"index":46,"extra":{"authors":["P.C.W. Davies"],"unstructured":"Davies, P.C.W.: In: Zurek, W.H. (ed.) Complexity, Entropy, and Physical Information, p. 61. Addison-Wesley, Redwood City (1990)","volume-title":"Complexity, Entropy, and Physical Information"},"key":"9186_CR47","year":1990,"container_name":"Complexity, Entropy, and Physical Information","locator":"61"},{"index":47,"extra":{"authors":["M. Tegmark"],"unstructured":"Tegmark, M.: Found. Phys. Lett. 9, 25 (1996)"},"key":"9186_CR48","year":1996,"container_name":"Found. Phys. Lett.","locator":"25"},{"index":48,"extra":{"authors":["H.D. Zeh"],"unstructured":"Zeh, H.D.: The Physical Basis of the Direction of Time, 4th edn. Springer, Berlin (2002)","volume-title":"The Physical Basis of the Direction of Time"},"key":"9186_CR49","year":2002,"container_name":"The Physical Basis of the Direction of Time"},{"index":49,"extra":{"authors":["A. Albrecht"],"unstructured":"Albrecht, A., Sorbo, L.: Phys. Rev. D 70, 063528 (2004)","volume":"70"},"key":"9186_CR50","year":2004,"container_name":"Phys. Rev. D","locator":"063528"},{"index":50,"extra":{"authors":["S.M. Carroll"],"unstructured":"Carroll, S.M., Chen, J.: Gen. Relativ. Gravit. 37, 1671 (2005)","volume":"37"},"key":"9186_CR51","year":2005,"container_name":"Gen. Relativ. Gravit.","locator":"1671"},{"index":51,"extra":{"unstructured":"Wald, R.M.: gr-qc/0507094 (2005)"},"key":"9186_CR52"},{"index":52,"extra":{"unstructured":"Page, D.N.: hep-th/0612137 (2006)"},"key":"9186_CR53"},{"index":53,"extra":{"authors":["A. Vilenkin"],"unstructured":"Vilenkin, A.: J. 
High Energy Phys. 701, 92 (2007)","volume":"701"},"key":"9186_CR54","year":2007,"container_name":"J. High Energy Phys.","locator":"92"},{"index":54,"extra":{"authors":["L. Boltzmann"],"unstructured":"Boltzmann, L.: Nature 51, 413 (1895)","volume":"51"},"key":"9186_CR55","year":1895,"container_name":"Nature","locator":"413"},{"index":55,"extra":{"authors":["A. Guth"],"unstructured":"Guth, A.: Phys. Rev. D 23, 347 (1981)","volume":"23"},"key":"9186_CR56","year":1981,"container_name":"Phys. Rev. D","locator":"347"},{"index":56,"extra":{"authors":["A. Vilenkin"],"unstructured":"Vilenkin, A.: Phys. Rev. D 27, 2848 (1983)","volume":"27"},"key":"9186_CR57","year":1983,"container_name":"Phys. Rev. D","locator":"2848"},{"index":57,"extra":{"authors":["A.A. Starobinsky"],"unstructured":"Starobinsky, A.A.: Fundamental Interactions, p. 55. MGPI Press, Moscow (1984)","volume-title":"Fundamental Interactions"},"key":"9186_CR58","year":1984,"container_name":"Fundamental Interactions","locator":"55"},{"index":58,"extra":{"authors":["A.D. Linde"],"doi":"10.1201/b16971","unstructured":"Linde, A.D.: Particle Physics and Inflationary Cosmology. Harwood, Switzerland (1990)","volume-title":"Particle Physics and Inflationary Cosmology"},"key":"9186_CR59","year":1990,"container_name":"Particle Physics and Inflationary Cosmology"},{"index":59,"extra":{"unstructured":"Guth, A.H.: hep-th/0702178 (2007)"},"key":"9186_CR60"},{"index":60,"extra":{"authors":["R. Penrose"],"unstructured":"Penrose, R.: N. Y. Acad. Sci. 571, 249 (1989)","volume":"571"},"key":"9186_CR61","year":1989,"container_name":"N. Y. Acad. Sci.","locator":"249"},{"index":61,"extra":{"authors":["S. Hollands"],"unstructured":"Hollands, S., Wald, R.M.: Gen. Relativ. Gravit. 34, 2043 (2002)","volume":"34"},"key":"9186_CR62","year":2002,"container_name":"Gen. Relativ. Gravit.","locator":"2043"},{"index":62,"extra":{"authors":["L. Kofman"],"unstructured":"Kofman, L., Linde, A., Mukhanov, V.: J. High Energy Phys. 
10, 57 (2002)","volume":"10"},"key":"9186_CR63","year":2002,"container_name":"J. High Energy Phys.","locator":"57"},{"index":63,"extra":{"authors":["D. Giulini"],"doi":"10.1007/978-3-662-03263-3","unstructured":"Giulini, D., Joos, E., Kiefer, C., Kupsch, J., Stamatescu, I.O., Zeh, H.D.: Decoherence and the Appearance of a Classical World in Quantum Theory. Springer, Berlin (1996)","volume-title":"Decoherence and the Appearance of a Classical World in Quantum Theory"},"key":"9186_CR64","year":1996,"container_name":"Decoherence and the Appearance of a Classical World in Quantum Theory"},{"index":64,"extra":{"authors":["D. Polarski"],"unstructured":"Polarski, D., Starobinsky, A.A.: Class. Quantum Gravity 13, 377 (1996)","volume":"13"},"key":"9186_CR65","year":1996,"container_name":"Class. Quantum Gravity","locator":"377"},{"index":65,"extra":{"authors":["K. Kiefer"],"doi":"10.1002/andp.2090070302","unstructured":"Kiefer, K., Polarski, D.: Ann. Phys. 7, 137 (1998)"},"key":"9186_CR66","year":1998,"container_name":"Ann. Phys.","locator":"137"},{"index":66,"extra":{"authors":["M. Tegmark"],"unstructured":"Tegmark, M.: J. Cosmol. Astropart. Phys. 2005(4), 1 (2005)","volume":"2005"},"key":"9186_CR67","year":2005,"container_name":"J. Cosmol. Astropart. Phys."},{"index":67,"extra":{"authors":["R. Easther"],"unstructured":"Easther, R., Lim, E.A., Martin, M.R.: J. Cosmol. Astropart. Phys. 0603, 16 (2006)","volume":"0603"},"key":"9186_CR68","year":2006,"container_name":"J. Cosmol. Astropart. Phys.","locator":"16"},{"index":68,"extra":{"authors":["R. Bousso"],"unstructured":"Bousso, R.: Phys. Rev. Lett. 97, 191302 (2006)","volume":"97"},"key":"9186_CR69","year":2006,"container_name":"Phys. Rev. Lett.","locator":"191302"},{"index":69,"extra":{"unstructured":"Vilenkin, A.: hep-th/0609193 (2006)"},"key":"9186_CR70"},{"index":70,"extra":{"unstructured":"Aguirre, A., Gratton, S., Johnson, M.C.: hep-th/0611221 (2006)"},"key":"9186_CR71"},{"index":71,"extra":{"authors":["J. 
Garriga"],"unstructured":"Garriga, J., Vilenkin, A.: Phys. Rev. D 64, 043511 (2001)","volume":"64"},"key":"9186_CR72","year":2001,"container_name":"Phys. Rev. D","locator":"043511"},{"index":72,"extra":{"authors":["D. Deutsch"],"unstructured":"Deutsch, D.: The fabric of reality. Allen Lane, New York (1997)","volume-title":"The fabric of reality"},"key":"9186_CR73","year":1997,"container_name":"The fabric of reality"},{"index":73,"extra":{"unstructured":"Linde, A.D.: hep-th/0211048 (2002)"},"key":"9186_CR74"},{"index":74,"extra":{"authors":["G.F.R. Ellis"],"unstructured":"Ellis, G.F.R., Kirchner, U., Stoeger, W.R.: Mon. Not. R. Astron. Soc. 347, 921 (2004)","volume":"347"},"key":"9186_CR75","year":2004,"container_name":"Mon. Not. R. Astron. Soc.","locator":"921"},{"index":75,"extra":{"unstructured":"Stoeger, W.R., Ellis, G.F.R., Kirchner, U.: astro-ph/0407329 (2004)"},"key":"9186_CR76"},{"index":76,"extra":{"authors":["R.D. Holder"],"unstructured":"Holder, R.D.: God, the Multiverse, and Everything: Modern Cosmology and the Argument from Design. Ashgate, Burlington (2004)","volume-title":"God, the Multiverse, and Everything: Modern Cosmology and the Argument from Design"},"key":"9186_CR77","year":2004,"container_name":"God, the Multiverse, and Everything: Modern Cosmology and the Argument from Design"},{"index":77,"extra":{"unstructured":"Weinberg, S.: hep-th/0511037 (2005)"},"key":"9186_CR78"},{"index":78,"extra":{"authors":["S.M. Carroll"],"unstructured":"Carroll, S.M.: Nature 440, 1132 (2006)","volume":"440"},"key":"9186_CR79","year":2006,"container_name":"Nature","locator":"1132"},{"index":79,"extra":{"unstructured":"Page, D.N.: hep-th/0610101 (2006)"},"key":"9186_CR80"},{"index":80,"extra":{"authors":["P. Davies"],"unstructured":"Davies, P.: In: Carr, B. (ed.) Universe or Multiverse? 
Cambridge University Press, Cambridge (2007)","volume-title":"Universe or Multiverse?"},"key":"9186_CR81","year":2007,"container_name":"Universe or Multiverse?"},{"index":81,"extra":{"authors":["M. Kaku"],"unstructured":"Kaku, M.: Parallel Worlds: A Journey Through Creation, Higher Dimensions, and the Future of the Cosmos. Anchor, New York (2006)","volume-title":"Parallel Worlds: A Journey Through Creation, Higher Dimensions, and the Future of the Cosmos"},"key":"9186_CR82","year":2006,"container_name":"Parallel Worlds: A Journey Through Creation, Higher Dimensions, and the Future of the Cosmos"},{"index":82,"extra":{"authors":["A. Vilenkin"],"unstructured":"Vilenkin, A.: Many Worlds in One: The Search for Other Universes. Hill and Wang, New York (2006)","volume-title":"Many Worlds in One: The Search for Other Universes"},"key":"9186_CR83","year":2006,"container_name":"Many Worlds in One: The Search for Other Universes"},{"index":83,"extra":{"authors":["R. Bousso"],"unstructured":"Bousso, R., Polchinski, J.: J. High Energy Phys. 6, 6 (2000)"},"key":"9186_CR84","year":2000,"container_name":"J. High Energy Phys."},{"index":84,"extra":{"authors":["J.L. Feng"],"unstructured":"Feng, J.L., March-Russell, J., Sethi, S., Wilczek, F.: Nucl. Phys. B 602, 307 (2001)","volume":"602"},"key":"9186_CR85","year":2001,"container_name":"Nucl. Phys. B","locator":"307"},{"index":85,"extra":{"authors":["S. Kachru"],"unstructured":"Kachru, S., Kallosh, R., Linde, A., Trivedi, S.P.: Phys. Rev. D 68, 046005 (2003)","volume":"68"},"key":"9186_CR86","year":2003,"container_name":"Phys. Rev. D","locator":"046005"},{"index":86,"extra":{"unstructured":"Susskind, L.: hep-th/0302219 (2003)"},"key":"9186_CR87"},{"index":87,"extra":{"authors":["S. Ashok"],"unstructured":"Ashok, S., Douglas, M.R.: J. High Energy Phys. 401, 60 (2004)","volume":"401"},"key":"9186_CR88","year":2004,"container_name":"J. High Energy Phys.","locator":"60"},{"index":88,"extra":{"authors":["S. 
Feferman"],"unstructured":"Feferman, S.: In the Light of Logic. Oxford University Press, Oxford (1998), Chap. 14","volume-title":"In the Light of Logic"},"key":"9186_CR89","year":1998,"container_name":"In the Light of Logic"},{"index":89,"extra":{"authors":["R. Hersh"],"unstructured":"Hersh, R.: What Is Mathematics, Really? Oxford University Press, Oxford (1999)","volume-title":"What Is Mathematics, Really?"},"key":"9186_CR90","year":1999,"container_name":"What Is Mathematics, Really?"},{"index":90,"extra":{"authors":["D. Lewis"],"unstructured":"Lewis, D.: On the Plurality of Worlds. Blackwell, Oxford (1986)","volume-title":"On the Plurality of Worlds"},"key":"9186_CR91","year":1986,"container_name":"On the Plurality of Worlds"},{"index":91,"extra":{"authors":["S. Hawking"],"unstructured":"Hawking, S.: A Brief History of Time. Touchstone, New York (1993)","volume-title":"A Brief History of Time"},"key":"9186_CR92","year":1993,"container_name":"A Brief History of Time"},{"index":92,"extra":{"authors":["G.F.R. Ellis"],"unstructured":"Ellis, G.F.R.: Class. Quantum Gravity A 16, 37 (1999)","volume":"16"},"key":"9186_CR93","year":1999,"container_name":"Class. Quantum Gravity A","locator":"37"},{"index":93,"extra":{"unstructured":"Schmidhuber, C.: hep-th/0011065 (2000)"},"key":"9186_CR94"},{"index":94,"extra":{"authors":["C.J. Hogan"],"unstructured":"Hogan, C.J.: Rev. Mod. Phys. 72, 1149 (2000)","volume":"72"},"key":"9186_CR95","year":2000,"container_name":"Rev. Mod. Phys.","locator":"1149"},{"index":95,"extra":{"authors":["P. Benioff"],"unstructured":"Benioff, P.: Phys. Rev. A 63, 032305 (2001)","volume":"63"},"key":"9186_CR96","year":2001,"container_name":"Phys. Rev. A","locator":"032305"},{"index":96,"extra":{"authors":["G.F.R. Ellis"],"unstructured":"Ellis, G.F.R.: Int. J. Mod. Phys. A 17, 2667 (2002)","volume":"17"},"key":"9186_CR97","year":2002,"container_name":"Int. J. Mod. Phys. A","locator":"2667"},{"index":97,"extra":{"authors":["N. 
Bostrom"],"unstructured":"Bostrom, N.: Anthropic Bias: Observation Selection Effects in Science and Philosophy. Routledge, New York (2002)","volume-title":"Anthropic Bias: Observation Selection Effects in Science and Philosophy"},"key":"9186_CR98","year":2002,"container_name":"Anthropic Bias: Observation Selection Effects in Science and Philosophy"},{"index":98,"extra":{"authors":["P. Benioff"],"unstructured":"Benioff, P.: Found. Phys. 32, 989 (2002)","volume":"32"},"key":"9186_CR99","year":2002,"container_name":"Found. Phys.","locator":"989"},{"index":99,"extra":{"unstructured":"Benioff, P.: quant-ph/0303086 (2003)"},"key":"9186_CR100"},{"index":100,"extra":{"authors":["M.M. Circovic"],"unstructured":"Circovic, M.M.: Found. Phys. 33, 467 (2003)","volume":"33"},"key":"9186_CR101","year":2003,"container_name":"Found. Phys.","locator":"467"},{"index":101,"extra":{"unstructured":"Vaas, R.: physics/0408111 (2004)"},"key":"9186_CR102"},{"index":102,"extra":{"unstructured":"Aguirre, A., Tegmark, M.: hep-th/0409072 (2004)"},"key":"9186_CR103"},{"index":103,"extra":{"authors":["P. Benioff"],"unstructured":"Benioff, P.: Found. Phys. 35, 1825 (2004)","volume":"35"},"key":"9186_CR104","year":2004,"container_name":"Found. Phys.","locator":"1825"},{"index":104,"extra":{"unstructured":"McCabe, G.: http://philsci-archive.pitt.edu/archive/00002218 (2005)"},"key":"9186_CR105"},{"index":105,"extra":{"authors":["P. Hut"],"unstructured":"Hut, P., Alford, M., Tegmark, M.: Found. Phys. 36, 765 (2006) physics/0510188","volume":"36"},"key":"9186_CR106","year":2006,"container_name":"Found. Phys.","locator":"765"},{"index":106,"extra":{"authors":["B. Vorhees"],"unstructured":"Vorhees, B., Luxford, C., Rhyan, A.: Int. J. Unconv. Comput. 1, 69 (2005)"},"key":"9186_CR107","year":2005,"container_name":"Int. J. Unconv. 
Comput.","locator":"69"},{"index":107,"extra":{"unstructured":"Ellis, G.F.R.: astro-ph/0602280 (2006)"},"key":"9186_CR108"},{"index":108,"extra":{"unstructured":"Stoeger, W.R.: astro-ph/0602356 (2006)"},"key":"9186_CR109"},{"index":109,"extra":{"unstructured":"Hedrich, R.: physics/0604171 (2006)"},"key":"9186_CR110"},{"index":110,"extra":{"authors":["K.E. Drexler"],"unstructured":"Drexler, K.E.: Engines of Creation: The Coming Era of Nanotechnology. Forth Estate, London (1985)","volume-title":"Engines of Creation: The Coming Era of Nanotechnology"},"key":"9186_CR111","year":1985,"container_name":"Engines of Creation: The Coming Era of Nanotechnology"},{"index":111,"extra":{"authors":["N. Bostrom"],"unstructured":"Bostrom, N.: Int. J. Futur. Stud. 2, 1 (1998)"},"key":"9186_CR112","year":1998,"container_name":"Int. J. Futur. Stud."},{"index":112,"extra":{"authors":["R. Kurzweil"],"unstructured":"Kurzweil, R.: The Age of Spiritual Machines: When Computers Exceed Human Intelligence. Viking, New York (1999)","volume-title":"The Age of Spiritual Machines: When Computers Exceed Human Intelligence"},"key":"9186_CR113","year":1999,"container_name":"The Age of Spiritual Machines: When Computers Exceed Human Intelligence"},{"index":113,"extra":{"authors":["H. Moravec"],"unstructured":"Moravec, H.: Robot: Mere Machine to Transcendent Mind. Oxford University Press, Oxford (1999)","volume-title":"Robot: Mere Machine to Transcendent Mind"},"key":"9186_CR114","year":1999,"container_name":"Robot: Mere Machine to Transcendent Mind"},{"index":114,"extra":{"authors":["F.J. Tipler"],"unstructured":"Tipler, F.J.: The Physics of Immortality. Doubleday, New York (1994)","volume-title":"The Physics of Immortality"},"key":"9186_CR115","year":1994,"container_name":"The Physics of Immortality"},{"index":115,"extra":{"authors":["N. Bostrom"],"unstructured":"Bostrom, N.: Philos. Q. 53, 243 (2003)","volume":"53"},"key":"9186_CR116","year":2003,"container_name":"Philos. 
Q.","locator":"243"},{"index":116,"extra":{"authors":["G. McCabe"],"unstructured":"McCabe, G.: Stud. Hist. Philos. Mod. Phys. 36, 591 (2005). physics/0511116","volume":"36"},"key":"9186_CR117","year":2005,"container_name":"Stud. Hist. Philos. Mod. Phys.","locator":"591"},{"index":117,"extra":{"authors":["R. Penrose"],"unstructured":"Penrose, R.: The Emperor's New Mind. Oxford University Press, Oxford (1989)","volume-title":"The Emperor's New Mind"},"key":"9186_CR118","year":1989,"container_name":"The Emperor's New Mind"},{"index":118,"extra":{"authors":["R. Penrose"],"unstructured":"Penrose, R.: In: Longair, M. (ed.) The Large, the Small and the Human Mind. Cambridge University Press, Cambridge (1997)","volume-title":"The Large, the Small and the Human Mind"},"key":"9186_CR119","year":1997,"container_name":"The Large, the Small and the Human Mind"},{"index":119,"extra":{"authors":["T. Hafting"],"unstructured":"Hafting, T.: Nature 436, 801 (2005)","volume":"436"},"key":"9186_CR120","year":2005,"container_name":"Nature","locator":"801"},{"index":120,"extra":{"authors":["R. Gambini"],"unstructured":"Gambini, R., Porto, R., Pullin, J.: New J. Phys. 6, 45 (2004)"},"key":"9186_CR121","year":2004,"container_name":"New J. Phys.","locator":"45"},{"index":121,"extra":{"authors":["G. Egan"],"unstructured":"Egan, G.: Permutation City. Harper, New York (1995)","volume-title":"Permutation City"},"key":"9186_CR122","year":1995,"container_name":"Permutation City"},{"index":122,"extra":{"unstructured":"Standish, R.K.: Phys. Found. Lett. 17 (2004)"},"key":"9186_CR123"},{"index":123,"extra":{"authors":["M. Davis"],"unstructured":"Davis, M.: Computability and Unsolvability. Dover, New York (1982)","volume-title":"Computability and Unsolvability"},"key":"9186_CR124","year":1982,"container_name":"Computability and Unsolvability"},{"index":124,"extra":{"authors":["D. Hilbert"],"unstructured":"Hilbert, D., Bernays, P.: Grundlagen der Matematik. 
Springer, Berlin (1934)","volume-title":"Grundlagen der Matematik"},"key":"9186_CR125","year":1934,"container_name":"Grundlagen der Matematik"},{"index":125,"extra":{"authors":["K. Gödel"],"unstructured":"Gödel, K.: Monatshefte Math. Phys. 38, 173 (1931)","volume":"38"},"key":"9186_CR126","year":1931,"container_name":"Monatshefte Math. Phys.","locator":"173"},{"index":126,"extra":{"authors":["S.G. Simpson"],"unstructured":"Simpson, S.G.: J. Symb. Log. 53, 349 (1988) http://www.math.psu.edu/simpson/papers/hilbert.pdf","volume":"53"},"key":"9186_CR127","year":1988,"container_name":"J. Symb. Log.","locator":"349"},{"index":127,"extra":{"authors":["J.W. Dawson"],"unstructured":"Dawson, J.W.: In: 21st Annual IEEE Symposium on Logic in Computer Science, p. 339. IEEE, New York (2006)","volume-title":"21st Annual IEEE Symposium on Logic in Computer Science"},"key":"9186_CR128","year":2006,"container_name":"21st Annual IEEE Symposium on Logic in Computer Science","locator":"339"},{"index":128,"extra":{"authors":["A. Church"],"unstructured":"Church, A.: Am. J. Math. 58, 345 (1936)","volume":"58"},"key":"9186_CR129","year":1936,"container_name":"Am. J. Math.","locator":"345"},{"index":129,"extra":{"authors":["A. Turing"],"unstructured":"Turing, A.: Proc. Lond. Math. Soc. 42, 230 (1936)","volume":"42"},"key":"9186_CR130","year":1936,"container_name":"Proc. Lond. Math. Soc.","locator":"230"},{"index":130,"extra":{"authors":["R.L. Goodstein"],"unstructured":"Goodstein, R.L.: Constructive Formalism, Essays on the Foundations of Mathematics. Leister University College, Leicester (1951)","volume-title":"Constructive Formalism, Essays on the Foundations of Mathematics"},"key":"9186_CR131","year":1951,"container_name":"Constructive Formalism, Essays on the Foundations of Mathematics"},{"index":131,"extra":{"authors":["X. Wen"],"unstructured":"Wen, X.: Prog. Theor. Phys. Suppl. 160, 351 (2006). cond-mat/0508020","volume":"160"},"key":"9186_CR132","year":2006,"container_name":"Prog. 
Theor. Phys. Suppl.","locator":"351"},{"index":132,"extra":{"unstructured":"Levin, M., Wen, X.: hep-th/0507118 (2005)"},"key":"9186_CR133"},{"index":133,"extra":{"authors":["J.D. Barrow"],"unstructured":"Barrow, J.D., Tipler, F.J.: The Anthropic Cosmological Principle. Clarendon, Oxford (1986)","volume-title":"The Anthropic Cosmological Principle"},"key":"9186_CR134","year":1986,"container_name":"The Anthropic Cosmological Principle"},{"index":134,"extra":{"authors":["A.D. Linde"],"unstructured":"Linde, A.D.: In: Hawking, S., Israel, W. (eds.) 300 Years of Gravitation. Cambridge University Press, Cambridge (1987)","volume-title":"300 Years of Gravitation"},"key":"9186_CR135","year":1987,"container_name":"300 Years of Gravitation"},{"index":135,"extra":{"authors":["S. Weinberg"],"unstructured":"Weinberg, S.: Phys. Rev. Lett. 59, 2607 (1987)","volume":"59"},"key":"9186_CR136","year":1987,"container_name":"Phys. Rev. Lett.","locator":"2607"},{"index":136,"extra":{"authors":["A.D. Linde"],"unstructured":"Linde, A.D.: Phys. Lett. B 201, 437 (1988)","volume":"201"},"key":"9186_CR137","year":1988,"container_name":"Phys. Lett. B","locator":"437"},{"index":137,"extra":{"unstructured":"Tegmark, M., Vilenkin, A., Pogosian, L.: astro-ph/0304536 (2003)"},"key":"9186_CR138"},{"index":138,"extra":{"authors":["L. Pogosian"],"unstructured":"Pogosian, L., Vilenkin, A., Tegmark, M.: J. Cosmol. Astropart. Phys. 407, 5 (2004)","volume":"407"},"key":"9186_CR139","year":2004,"container_name":"J. Cosmol. Astropart. Phys."},{"index":139,"extra":{"authors":["R. Jones"],"unstructured":"Jones, R.: Philos. Sci. 58, 185 (1991)","volume":"58"},"key":"9186_CR140","year":1991,"container_name":"Philos. Sci.","locator":"185"},{"index":140,"extra":{"authors":["O. Pooley"],"unstructured":"Pooley, O.: In: Rickles, D.P., French, S.R.D. (eds.) The Structural Foundations of Quantum Gravity. 
Oxford University Press, Oxford (2007)","volume-title":"The Structural Foundations of Quantum Gravity"},"key":"9186_CR141","year":2007,"container_name":"The Structural Foundations of Quantum Gravity"},{"index":141,"extra":{"unstructured":"Larsson, T.A.: math-ph/0103013v3 (2001)"},"key":"9186_CR142"}],"contribs":[{"index":0,"raw_name":"Max Tegmark","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"Springer Nature","pages":"101-150","volume":"38","ext_ids":{"doi":"10.1007/s10701-007-9186-9","wikidata_qid":"Q54060087","core":"1933957"},"release_year":2007,"release_date":"2007-11-08","release_stage":"published","release_type":"article-journal","container_id":"6xxkdcdu35buta54bw7h2hblju","work_id":"ftl6xv267vb6xfech3khri3nwa","title":"The Mathematical Universe","state":"active","ident":"uooqfttis5cp3b3lgd7ltesg2a","revision":"8db356b3-279c-485a-9036-cd9762928b5a","extra":{"crossref":{"alternative-id":["9186"],"type":"journal-article"}}}
+{"abstracts":[{"sha1":"79671b863d8a2cd888746b3e8c2e6471caec19fd","content":"I explore physics implications of the External Reality Hypothesis (ERH) that\nthere exists an external physical reality completely independent of us humans.\nI argue that with a sufficiently broad definition of mathematics, it implies\nthe Mathematical Universe Hypothesis (MUH) that our physical world is an\nabstract mathematical structure. I discuss various implications of the ERH and\nMUH, ranging from standard physics topics like symmetries, irreducible\nrepresentations, units, free parameters, randomness and initial conditions to\nbroader issues like consciousness, parallel universes and Godel incompleteness.\nI hypothesize that only computable and decidable (in Godel's sense) structures\nexist, which alleviates the cosmological measure problem and help explain why\nour physical laws appear so simple. I also comment on the intimate relation\nbetween mathematical structures, computations, simulations and physical\nsystems.","mimetype":"text/plain","lang":"en"}],"refs":[],"contribs":[{"index":0,"raw_name":"Max Tegmark","role":"author"}],"language":"en","version":"v1","ext_ids":{"arxiv":"0704.0646v1"},"release_year":2007,"release_date":"2007-04-05","release_stage":"submitted","release_type":"article","work_id":"ftl6xv267vb6xfech3khri3nwa","title":"The Mathematical Universe","state":"active","ident":"ornjo5inebc7nkv6gux3lzdpx4","revision":"0e3a84c8-1955-49e8-be2f-28da1daf727b","extra":{"arxiv":{"base_id":"0704.0646","categories":["gr-qc","astro-ph","hep-th"],"comments":"Replaced to match accepted Found. Phys. version, 31 pages, 5 figs;\n more details at http://space.mit.edu/home/tegmark/toe.html","journal_ref":"Found.Phys.38:101-150,2008"},"superceded":true}}
+{"abstracts":[{"sha1":"79671b863d8a2cd888746b3e8c2e6471caec19fd","content":"I explore physics implications of the External Reality Hypothesis (ERH) that\nthere exists an external physical reality completely independent of us humans.\nI argue that with a sufficiently broad definition of mathematics, it implies\nthe Mathematical Universe Hypothesis (MUH) that our physical world is an\nabstract mathematical structure. I discuss various implications of the ERH and\nMUH, ranging from standard physics topics like symmetries, irreducible\nrepresentations, units, free parameters, randomness and initial conditions to\nbroader issues like consciousness, parallel universes and Godel incompleteness.\nI hypothesize that only computable and decidable (in Godel's sense) structures\nexist, which alleviates the cosmological measure problem and help explain why\nour physical laws appear so simple. I also comment on the intimate relation\nbetween mathematical structures, computations, simulations and physical\nsystems.","mimetype":"text/plain","lang":"en"}],"refs":[],"contribs":[{"index":0,"raw_name":"Max Tegmark","role":"author"}],"language":"en","version":"v2","ext_ids":{"arxiv":"0704.0646v2"},"release_year":2007,"release_date":"2007-10-08","release_stage":"accepted","release_type":"article","work_id":"bbbbbbbb7vb6xfech3khri3nwa","title":"The Mathematical Universe","state":"active","ident":"domerrq5mvdqfdbolkunct3esy","revision":"540ac97b-eb73-43d5-a1da-cfbcf1f50eb2","extra":{"arxiv":{"base_id":"0704.0646","categories":["gr-qc","astro-ph","hep-th"],"comments":"Replaced to match accepted Found. Phys. version, 31 pages, 5 figs;\n more details at http://space.mit.edu/home/tegmark/toe.html","journal_ref":"Found.Phys.38:101-150,2008"}}}
diff --git a/nginx/README.md b/nginx/README.md
new file mode 100644
index 0000000..0369f9b
--- /dev/null
+++ b/nginx/README.md
@@ -0,0 +1,18 @@
+
+This folder contains nginx configs for partner access to sandcrawler DB
+(postgrest) and GROBID XML blobs (minio).
+
+`fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to
+show how it works.
+
+## Let's Encrypt
+
+As... bnewbold?
+
+ sudo certbot certonly \
+ --non-interactive \
+ --agree-tos \
+ --email bnewbold@archive.org \
+ --webroot -w /var/www/letsencrypt \
+ -d sandcrawler-minio.fatcat.wiki \
+ -d sandcrawler-db.fatcat.wiki
diff --git a/nginx/fatcat-blobs b/nginx/fatcat-blobs
new file mode 100644
index 0000000..5c692ef
--- /dev/null
+++ b/nginx/fatcat-blobs
@@ -0,0 +1,51 @@
+
+server {
+ listen 80;
+ listen [::]:80;
+ listen 443 ssl http2;
+ listen [::]:443 ssl http2;
+ server_name blobs.fatcat.wiki;
+
+ ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
+ ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;
+
+ #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
+ add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5
+ add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5
+ add_header X-Xss-Protection "1";
+ # Enable STS with one year period (breaks http; optional)
+ #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
+
+ error_log /var/log/nginx/fatcat-errors.log;
+ access_log /dev/null;
+
+ if ($scheme = http) {
+ return 301 https://$server_name$request_uri;
+ }
+
+ location /unpaywall/ {
+ if ($request_method !~ "GET") {
+ return 403;
+ break;
+ }
+
+ #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args;
+ proxy_pass http://207.241.227.141:9000$uri$is_args$args;
+ proxy_redirect off;
+
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header Host $http_host;
+ }
+
+ location / {
+ default_type text/plain;
+ return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead';
+ }
+
+ # Let's Encrypt SSL Certs
+ location /.well-known/acme-challenge/ {
+ root /var/www/letsencrypt;
+ autoindex off;
+ }
+}
diff --git a/nginx/sandcrawler-db b/nginx/sandcrawler-db
new file mode 100644
index 0000000..67d1a2d
--- /dev/null
+++ b/nginx/sandcrawler-db
@@ -0,0 +1,80 @@
+
+upstream postgrest {
+ server localhost:3030;
+ keepalive 64;
+}
+
+server {
+ listen 80;
+ listen [::]:80;
+ listen 443 ssl http2;
+ listen [::]:443 ssl http2;
+ server_name sandcrawler-db.fatcat.wiki db.sandcrawler.org;
+
+ ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem;
+ ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem;
+
+ #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
+ add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5
+ add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5
+ add_header X-Xss-Protection "1";
+ # Enable STS with one year period (breaks http; optional)
+ #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
+
+ error_log /var/log/nginx/sandcrawler-errors.log;
+ access_log /dev/null;
+
+ if ($scheme = http) {
+ return 301 https://$server_name$request_uri;
+ }
+
+ location / {
+
+ default_type application/json;
+
+ if ($request_method !~ "GET") {
+ return 403;
+ break;
+ }
+
+ proxy_redirect off;
+
+ proxy_http_version 1.1;
+ proxy_set_header Connection "";
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header Host $http_host;
+
+ proxy_pass http://postgrest/;
+ }
+
+ # support /endpoint/:id url style for sha1hex lookups
+ location ~ "^/(file_meta|grobid|fatcat_file)/([a-f0-9]{40})$" {
+
+ if ($request_method !~ "GET") {
+ return 403;
+ break;
+ }
+
+ # assuming an upstream named "postgrest"
+ # doing this rewrite as part of the proxy_pass line itself didn't seem
+ # to work, so doing a formal rewrite here
+ rewrite "/([a-z_]+)/([a-f0-9]{40})" /$1?sha1hex=eq.$2 break;
+ proxy_pass http://postgrest;
+
+ # make the response singular
+ #default_type application/vnd.pgrst.object+json;
+ proxy_set_header Accept "application/vnd.pgrst.object+json";
+
+ proxy_http_version 1.1;
+ proxy_set_header Connection "";
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ }
+
+ # Let's Encrypt SSL Certs
+ location /.well-known/acme-challenge/ {
+ root /var/www/letsencrypt;
+ autoindex off;
+ }
+}
diff --git a/nginx/sandcrawler-minio b/nginx/sandcrawler-minio
new file mode 100644
index 0000000..2e9bfe3
--- /dev/null
+++ b/nginx/sandcrawler-minio
@@ -0,0 +1,57 @@
+
+server {
+ listen 80;
+ listen [::]:80;
+ listen 443 ssl http2;
+ listen [::]:443 ssl http2;
+ server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org;
+
+ ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem;
+ ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem;
+
+ #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
+ add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5
+ add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5
+ add_header X-Xss-Protection "1";
+ # Enable STS with one year period (breaks http; optional)
+ #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
+
+ error_log /var/log/nginx/sandcrawler-errors.log;
+ access_log /dev/null;
+
+ if ($scheme = http) {
+ return 301 https://$server_name$request_uri;
+ }
+
+ location /minio/ {
+
+ # allows all HTTP verbs
+
+ proxy_pass http://localhost:9000;
+ proxy_redirect off;
+
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header Host $http_host;
+ }
+
+ location / {
+ if ($request_method !~ "GET") {
+ return 403;
+ break;
+ }
+
+ proxy_pass http://localhost:9000;
+ proxy_redirect off;
+
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header Host $http_host;
+ }
+
+ # Let's Encrypt SSL Certs
+ location /.well-known/acme-challenge/ {
+ root /var/www/letsencrypt;
+ autoindex off;
+ }
+}
diff --git a/notes/backfill_scalding_rewrite.txt b/notes/backfill_scalding_rewrite.txt
new file mode 100644
index 0000000..f5fb1d1
--- /dev/null
+++ b/notes/backfill_scalding_rewrite.txt
@@ -0,0 +1,22 @@
+
+Background context needed:
+- CDX text file format
+- rough arch outline (what runs where)
+- basic hadoop+hbase overview
+- hbase schema
+- quick look at hadoop and hbase web interfaces
+- maybe quick re-profile?
+
+Plan/Steps:
+x together: get *any* JVM map/reduce thing to build and run on cluster
+x together: get something to build that talks to hbase
+x basic JVM test infra; HBase mockup. "shopping"
+ => scalding and/or cascading
+x simple hbase scan report generation (counts/stats)
+x CDX parsing
+- complete backfill script
+
+Spec for CDX backfill script:
+- input is CDX, output to HBase table
+- filter input before anything ("defensive"; only PDF, HTTP 200, size limit)
+- reads HBase before insert; don't overwrite
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
new file mode 100644
index 0000000..d330e9b
--- /dev/null
+++ b/notes/crawl_cdx_merge.md
@@ -0,0 +1,29 @@
+
+## New Way
+
+Run script from scratch repo:
+
+ ~/scratch/bin/cdx_collection.py CRAWL-2000
+
+ zcat CRAWL-2000.cdx.gz | wc -l
+
+ # update crawl README/ANALYSIS/whatever
+
+Assuming we're just looking at PDFs:
+
+ zcat CRAWL-2000.cdx.gz | rg -i pdf | sort -S 4G -u > CRAWL-2000.sorted.cdx
+
+## Old Way
+
+Use metamgr to export an items list.
+
+Get all the CDX files and merge/sort:
+
+ mkdir CRAWL-2000 && cd CRAWL-2000
+ cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
+ ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
+ sort -S 4G -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+ wc -l CRAWL-2000.cdx
+ rm CRAWL-2000.unsorted.cdx
+
+ # gzip and upload to petabox, or send to HDFS, or whatever
diff --git a/notes/fuzzy_match_notes.md b/notes/fuzzy_match_notes.md
new file mode 100644
index 0000000..a87364c
--- /dev/null
+++ b/notes/fuzzy_match_notes.md
@@ -0,0 +1,148 @@
+
+These are notes on how bibliographic metadata matches (of records) and
+slugification (to create lookup keys on title strings) worked in the past in
+the sandcrawler repository. Eg, circa 2018.
+
+## Scala Slug-ification
+
+Original title strings longer than 1023 characters were rejected (before
+slug-ifying).
+
+There was a "slug-denylist". Additionally, scorable strings needed to be
+between 8 and 1023 characters (not bytes) long (inclusive).
+
+Slugification transform was:
+
+- lower-case
+- remove whitespace ("\s")
+- strip specific accent characters:
+ '\u0141' -> 'L',
+ '\u0142' -> 'l', // Letter ell
+ '\u00d8' -> 'O',
+ '\u00f8' -> 'o'
+- remove all '\p{InCombiningDiacriticalMarks}'
+- remove punctuation:
+ \p{Punct}
+ ’·“”‘’“”«»「」¿–±§
+
+Partially adapted from apache commons: <https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934>
+
+My original notes/proposal:
+
+1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+2. strip accents
+3. "lower-case" (unicode-aware)
+4. do any final custom/manual mappings
+
+Resulting slugs less than 8 characters long were rejected, and slugs were
+checked against a denylist.
+
+Only 554 entries in the denylist; could just ship that in the library.
+
+
+## Python Tokenization
+
+- "&apos;" -> "'"
+- remove non "isalnum()" characters
+- encode as ASCII; this removes diacritics etc, but also all non-latin character sets
+- optionally remove all whitespace
+
+
+## Python GROBID Cleanups
+
+These are likely pretty GROBID-specific. Article title was required, but any of
+the other filtered-out fields just resulted in partial metadata. These filters
+are the result of lots of manual verification of results, and doing things like
+truncating titles and looking at the most popular prefixes for a large
+random sample.
+
+Same denylist for title slugs as Scala, plus:
+
+ editorial
+ advertisement
+ bookreviews
+ reviews
+ nr
+ abstractoriginalarticle
+ originalarticle
+ impactfactor
+ articlenumber
+
+Other filters on title strings (any of these bad):
+
+- 500 or more characters long
+- tokenized string less than 10 characters
+- tokenized starts with 'nr' or 'issn'
+- lowercase starts with 'int j' or '.int j'
+- contains both "volume" and "issue"
+- contains "downloadedfrom"
+- fewer than 2 or more than 50 tokens (words)
+- more than 12 tokens only a single character long
+- more than three ":"; more than one "|"; more than one "."
+
+Remove these title prefixes (but otherwise allow the title):
+
+ "Title: "
+ "Original Article: "
+ "Original Article "
+ "Article: "
+
+Denylist for authors:
+
+ phd
+ phdstudent
+
+Journal name processing:
+
+- apply title denylist
+- remove prefixes
+ characters: /~&©
+ Original Research Article
+ Original Article
+ Research Article
+ Available online www.jocpr.com
+- remove suffixes
+ Available online at www.sciarena.com
+ Original Article
+ Available online at
+ ISSN
+ ISSUE
+- remove anywhere
+ e-ISSN
+ p-ISSN
+
+## Python Grouping Comparison
+
+Would consume joined groups, row-by-row. At most 10 matches per group; any more
+and skip (this was for file-to-release).
+
+Overall matching requirements:
+
+- string similarity threshold from scala code
+ https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+ https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java/16018452#16018452
+- authors should be consistent
+ - convert one author list into space-separated tokens
+ - remove "jr." from all author token lists
+ - the last word for each author full name in the other list (eg, the lastname),
+ tokenized, must be in the token set
+- if both years defined, then must match exactly (integers)
+
+In the code, there is a note:
+
+ Note: the actual importer/merger should filter the following patterns out:
+ - container title has "letter" and "diar"
+ - contribs (authors) contain "&NA;"
+ - dates differ (not just year)
+
+
+## Scala Metadata Keys
+
+Only the titles were ever actually used (in scala), but the keys allowed were:
+
+- title
+- authors (list of strings)
+- year (int)
+- contentType
+- doi
+
diff --git a/notes/grobid_munging.txt b/notes/grobid_munging.txt
new file mode 100644
index 0000000..013e458
--- /dev/null
+++ b/notes/grobid_munging.txt
@@ -0,0 +1,70 @@
+
+In docker:
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg | pv -l | rg 'OA-JOURNAL-CRAWL-2019-08' > OA-JOURNAL-CRAWL-2019-08.grobid.json
+ # 5.01M 0:31:04 [2.69k/s]
+ # 277 GByte grobid-output.prod.json
+
+Then:
+
+ cat grobid-output.prod.json | rg 'OA-JOURNAL-CRAWL-2019-08' | pv -l > OA-JOURNAL-CRAWL-2019-08.grobid.json
+ # 265k 0:32:12 [ 137 /s]
+
+ pigz grobid-output.prod.json
+ # 63 GByte grobid-output.prod.json.gz
+
+ cat OA-JOURNAL-CRAWL-2019-08.grobid.json | pv -l | jq "[.key, .status, .status_code, .error_msg] | @tsv" -r | sort -u -S 4G | uniq --check-chars 40 > OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 265k
+
+ wc -l OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 212879 OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+
+ cut -f2 OA-JOURNAL-CRAWL-2019-08.grobid.tsv | sort | uniq -c
+ # 14087 error
+ # 198792 success
+
+In sandcrawler pipenv:
+
+ head -n100 /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.grobid.json | ./grobid_tool.py transform --metadata-only - > /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.metadata.json.sample
+
+ cat /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.grobid.json | parallel --linebuffer --round-robin --pipe -j8 ./grobid_tool.py transform --metadata-only - > /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.metadata.json
+
+ cat OA-JOURNAL-CRAWL-2019-08.metadata.json | rg -v '"fatcat_release": null' > OA-JOURNAL-CRAWL-2019-08.metadata.matched.json
+
+ wc -l OA-JOURNAL-CRAWL-2019-08.metadata.matched.json OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 28162 OA-JOURNAL-CRAWL-2019-08.metadata.matched.json
+ # 212879 OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+
+Next steps:
+- import the matched files (while verifying match)
+- some web interface to make sandcrawler easier?
+ input: sha1 or url
+ view: grobid status and metadata, ML results, fatcat metadata (via API lookup)
+ links/actions: view PDF, re-run GROBID, add to a release (via API)
+
+## BAD/BROKEN
+
+None of the following worked because old versions of kafkacat only read
+partial results. Ended up using docker to run more recent ubuntu, sigh.
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l > grobid-output.prod.json
+
+ cat grobid-output.prod.json | rg '"status": "success"' > grobid-output.prod.success.json
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l | rg '"status": "success"' > grobid-output.prod.success.json
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l | rg 'OA-JOURNAL-CRAWL-2019-08' > OA-JOURNAL-CRAWL-2019-08.grobid.json
+
+ head -n200 /grande/oa-crawl-grobid/grobid-output.prod.success.json | ./grobid_tool.py transform --metadata-only - | jq "[.fatcat_release, .biblio.title]" -c | less
+
+
+ cat OA-JOURNAL-CRAWL-2019-08.grobid.json | parallel --pipe -j8 jq .status -r | sort | uniq -c
+ 1879 error
+ 26698 success
+
+
+For full grobid-output was looking like:
+
+ 318561 error
+ 199607 success
+
diff --git a/notes/hadoop_job_log.md b/notes/hadoop_job_log.md
new file mode 100644
index 0000000..f812c0a
--- /dev/null
+++ b/notes/hadoop_job_log.md
@@ -0,0 +1,210 @@
+
+## QA matchcrossref
+
+[D8C7F2CA7620450991838D540489948D/8B17786779BE44579C98D8A325AC5959] sandcrawler.ScoreJob/(1/1) ...-24-2102.32-matchcrossref
+
+Submitted: Fri Aug 24 21:03:09 UTC 2018
+Started: Fri Aug 24 21:03:20 UTC 2018
+Finished: Sat Aug 25 09:46:55 UTC 2018
+Elapsed: 12hrs, 43mins, 34sec
+Diagnostics:
+Average Map Time 24mins, 31sec
+Average Shuffle Time 15sec
+Average Merge Time 21sec
+Average Reduce Time 7mins, 17sec
+
+Map 2312 2312
+Reduce 100 100
+
+crossref-rows-filtered 73901964 0 73901964
+grobid-rows-filtered 1092992 0 1092992
+joined-rows 0 623837 623837
+
+cascading.flow.StepCounters
+Tuples_Read 94831255 0 94831255
+Tuples_Written 0 623837 623837
+
+Read_Duration 7108430 352241 7460671
+Tuples_Read 94831255 74994956 169826211
+Tuples_Written 74994956 623837 75618793
+Write_Duration 7650302 21468 7671770
+
+## QA UnGrobided
+
+Submitted: Sat Aug 25 01:23:22 UTC 2018
+Started: Sat Aug 25 05:06:36 UTC 2018
+Finished: Sat Aug 25 05:13:45 UTC 2018
+Elapsed: 7mins, 8sec
+Diagnostics:
+Average Map Time 1mins, 20sec
+Average Shuffle Time 12sec
+Average Merge Time 15sec
+Average Reduce Time 29sec
+
+Map 48 48
+Reduce 1 1
+
+bnewbold@bnewbold-dev$ gohdfs du -sh sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part*
+56.8M /user/bnewbold/sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part-00000
+
+## Prod UnGrobided
+
+[D76F6BF91D894E879E747C868B0DEDE7/394A1AFC44694992B71E6920AF8BA3FB] sandcrawler.DumpUnGrobidedJob/(1/1) ...26-0910.25-dumpungrobided
+
+Map 278 278
+Reduce 1 1
+
+Submitted: Sun Aug 26 09:10:51 UTC 2018
+Started: Sun Aug 26 09:18:21 UTC 2018
+Finished: Sun Aug 26 10:29:28 UTC 2018
+Elapsed: 1hrs, 11mins, 7sec
+Diagnostics:
+Average Map Time 4mins, 48sec
+Average Shuffle Time 24mins, 17sec
+Average Merge Time 14sec
+Average Reduce Time 13mins, 54sec
+
+
+cascading.flow.StepCounters
+Name
+Map
+Reduce
+Total
+Tuples_Read 64510564 0 64510564
+Tuples_Written 0 21618164 21618164
+
+## Prod Crossref Match
+
+[6C063C0809244446BA8602C3BE99CEC2/5FE5D87899154F38991A1ED58BEB34D4] sandcrawler.ScoreJob/(1/1) ...-25-1753.01-matchcrossref
+
+Map 2427 2427
+Reduce 50 50
+
+Submitted: Sat Aug 25 17:53:50 UTC 2018
+Started: Sat Aug 25 17:53:59 UTC 2018
+Finished: Sun Aug 26 11:22:52 UTC 2018
+Elapsed: 17hrs, 28mins, 52sec
+Diagnostics:
+Average Map Time 31mins, 20sec
+Average Shuffle Time 1mins, 21sec
+Average Merge Time 41sec
+Average Reduce Time 3hrs, 14mins, 39sec
+
+crossref-rows-filtered 73901964 0 73901964
+grobid-rows-filtered 14222226 0 14222226
+joined-rows 0 14115453 14115453
+
+## "Prod" Fatcat Group Works (run 2019-08-10)
+
+ ./please --prod groupworks-fatcat hdfs:///user/bnewbold/release_export.2019-07-07.json
+
+ job_1559844455575_118299
+ http://ia802401.us.archive.org:6988/proxy/application_1559844455575_118299
+
+## Re-GROBID batch (2019-11-12)
+
+Want to re-process "old" GROBID output with newer (0.5.5+fatcat) GROBID version
+(vanilla training) plus biblio-glutton identification. Hoping to make a couple
+million new fatcat matches; will probably do a later round of ML matching over
+this batch as well.
+
+ # in /grande/regrobid
+
+ # as postgres
+ psql sandcrawler < dump_regrobid_pdf.sql > dump_regrobid_pdf.txt
+
+ # as bnewbold
+ cat dump_regrobid_pdf.txt | sort -S 4G | uniq -w 40 | cut -f2 | pv -l > dump_regrobid_pdf.2019-11-12.json
+ # 41.5M lines, uniq by SHA1
+ # NOTE: not the full 56m+ from GROBID table... some in archive.org, others
+ # not application/pdf type. Will need to follow-up on those later
+
+ # intend to have 3 worker machines, but splitting 6 ways in case we need to
+ # re-balance load or get extra machines or something
+ split -n l/6 -a1 -d --additional-suffix=.json dump_regrobid_pdf.2019-11-12.json regrobid_cdx.split_
+
+ # distribute to tmp001, tmp002, tmp003:
+ tmp001: 0,1
+ tmp002: 2,3
+ tmp003: 4,5
+
+ # test local grobid config:
+ head /srv/sandcrawler/tasks/regrobid_cdx.split_0.json | pv -l | ./grobid_tool.py --grobid-host http://localhost:8070 -j0 extract-json - > example_out.json
+ # expect at least a couple fatcat matches
+ cat example_out.json | jq .tei_xml -r | rg fatcat
+
+ # test GROBID+kafka config:
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | head | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+ # full run, in a screen session
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+NOTE: really should get parallel kafka worker going soon. if there is a reboot
+or something in the middle of this process, will need to re-run from the start.
+
+Was getting a bunch of weird kafka INVALID_MSG errors on produce. Would be nice to be able to retry, so doing:
+
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel --joblog regrobid_job.log --retries 5 -j40 --linebuffer --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+Never mind, going to split into chunks which can be retried.
+
+ cd /srv/sandcrawler/tasks
+ sudo chown sandcrawler:staff .
+ cat regrobid_cdx.split_* | split -l 20000 -a4 -d --additional-suffix=.json - chunk_
+ ls /srv/sandcrawler/tasks/chunk_*.json | parallel -j4 ./extract_chunk.sh {}
+
+extract_chunk.sh:
+
+
+ #!/bin/bash
+
+ set -x -e -u -o pipefail
+
+ if [ -f $1.SUCCESS ]; then
+ echo "Skipping: $1..."
+ exit
+ fi
+
+ echo "Extracting $1..."
+
+ date
+ cat $1 | parallel -j10 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+ touch $1.SUCCESS
+
+seems to be working better! tested and if there is a problem with one chunk the others continue
+
+## Pig Joins (around 2019-12-24)
+
+Partial (as a start):
+
+ pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig
+
+ HadoopVersion PigVersion UserId StartedAt FinishedAt Features
+2.6.0-cdh5.11.2 0.12.0-cdh5.0.1 bnewbold 2019-12-27 00:39:38 2019-12-27 15:32:44 HASH_JOIN,ORDER_BY,DISTINCT,FILTER
+
+ Success!
+
+ Job Stats (time in seconds):
+ JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MedianMapTime MaxReduceTime MinReduceTime AvgReduceTime MedianReducetime Alias Feature Outputs
+ job_1574819148370_46540 4880 0 143 10 27 21 n/a n/a n/a n/a cdx MAP_ONLY
+ job_1574819148370_46541 19 0 59 9 25 18 n/a n/a n/a n/a digests MAP_ONLY
+ job_1574819148370_46773 24 1 17 7 10 9 6 6 6 6 digests SAMPLER
+ job_1574819148370_46774 7306 1 55 4 7 7 25 25 25 25 cdx SAMPLER
+ job_1574819148370_46778 7306 40 127 8 18 15 4970 1936 2768 2377 cdx ORDER_BY
+ job_1574819148370_46779 24 20 80 24 60 66 90 26 38 37 digests ORDER_BY
+ job_1574819148370_46822 22 3 101 27 53 48 1501 166 735 539 DISTINCT
+ job_1574819148370_46828 7146 959 122 7 16 14 91 21 35 32 full_join,result HASH_JOIN /user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx,
+
+ Input(s):
+ Successfully read 1968654006 records (654323590996 bytes) from: "/user/bnewbold/pdfs/gwb-pdf-20191005172329"
+ Successfully read 74254196 records (2451575849 bytes) from: "/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted"
+
+ Output(s):
+ Successfully stored 0 records in: "/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx"
+
+Oops! Didn't upper-case the sha1b32 output.
+
+Full GWB:
+
+ pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig
diff --git a/notes/hbase_table_sizes.txt b/notes/hbase_table_sizes.txt
new file mode 100644
index 0000000..97bbb16
--- /dev/null
+++ b/notes/hbase_table_sizes.txt
@@ -0,0 +1,12 @@
+
+As of 2018-05-29:
+- qa rows: 1,246,013
+- prod rows: 8,974,188
+
+As of 2018-06-16:
+- qa: 1,246,013
+- prod: 18,308,086
+
+As of 2018-08-01:
+- qa: 1,246,013
+- prod: 18,308,141
diff --git a/notes/html_ingest_notes.md b/notes/html_ingest_notes.md
new file mode 100644
index 0000000..a1a91f3
--- /dev/null
+++ b/notes/html_ingest_notes.md
@@ -0,0 +1,318 @@
+
+## Current Plan
+
+- selectolax to extract metadata and quickly filter (speed)
+ => eg, differentiate landing pages from fulltext
+ => also embed URLs?
+- trafilatura for fulltext body extract
+- no solution yet for reference parsing
+ => maybe trafilatura XML-TEI parsing, then GROBID?
+ => especially if DOI/identifier/URL is in the reference
+
+
+
+TODO:
+x print/wrap error condition better
+x serialize dates (pydantic)
+x CDX lookup "closest" to capture datetime (or by month)
+x firstmonday no extracted fulltext/XML
+x apply URL base fixup to fulltext URLs
+x XML alternative detection
+x basic ingest worker, kafka topics, persist workers, sql table, etc
+- ingest worker: landing page to actual fulltext (eg, OJS)
+- broken? https://betterexplained.com/articles/colorized-math-equations/
+
+Ponder:
+- CDX lookup older successful captures
+ http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ => optional filter by status? "reduce" by month/year?
+- detect scope heuristically
+ bepress_is_article_cover_page 1
+ citation_fulltext_world_readable "" (eg, distill)
+- non-success subresource fetches
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+- redirects: keep start URL?
+
+Later:
+- XML URL extraction
+ https://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-19652002000200001&lng=en&nrm=iso&tlng=pt
+ <a href="http://www.scielo.br/scieloOrg/php/articleXML.php?pid=S0100-19652002000200001&amp;lang=en" rel="nofollow" target="xml">
+- selectolax bug? hangs: `css_first("meta['thing']")`
+- youtube embed
+ => download/include actual video file?
+- parse references in citation headers
+- try parsing references in HTML fulltext
+
+## Testing URLs
+
+- PLOS
+ https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949
+ TODO: "May 9, 2014"
+ TODO: appendix
+- peerj
+ https://peerj.com/articles/4375/
+- scielo
+ http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032020000200081&lng=en&nrm=iso&tlng=es
+ bunch of little icon .png, but ok
+ redirect of an image not saved in webcapture
+- wordpress
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+ no HTML meta? hrm
+- old OJS
+ (pdf only) http://rjh.folium.ru/index.php/rjh/article/view/1511
+- new OJS
+ https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729
+- plain HTML
+ http://journal.sjdm.org/12/12627/jdm12627.html
+- blogs/essays
+ http://symbolflux.com/lodessay/
+ https://betterexplained.com/articles/colorized-math-equations/
+ https://web.archive.org/web/20120418231513/http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ https://research.google.com/bigpicture/attacking-discrimination-in-ml/
+ http://www.econgraphs.org/
+- journal homepage (not fulltext)
+- OJS new landing page (not fulltext)
+- OJS old (not fulltext)
+ http://rjh.folium.ru/index.php/rjh/index
+ http://rjh.folium.ru/index.php/rjh/issue/view/106
+ http://rjh.folium.ru/index.php/rjh/article/view/382
+- distill
+ https://distill.pub/2020/bayesian-optimization/
+ https://distill.pub/2018/feature-wise-transformations/
+- youtube video embed
+ http://www.cond.org/persalog.html
+- youtube video direct?
+- github: project README?
+- wikipedia
+
+## Background Research
+
+- scrapy (?)
+- requests-html: can run javascript
+ => good for metadata extraction?
+- selectolax
+- scrapely: give HTML and extracted text, it builds the parser
+ => good for difficult one-off cases?
+- https://rushter.com/blog/python-fast-html-parser/
+- WET generation from WARC, a la common crawl
+- https://towardsdatascience.com/categorizing-world-wide-web-c130abd9b717
+
+Other random stuff:
+- distilBERT: most BERT accuracy, 0.4 factor latency (faster)?
+ https://medium.com/huggingface/distilbert-8cf3380435b5
+- htmldate: finds "date of publication" for a document
+- adblockparser
+ => good as a filter in HTML ingest
+- w3lib: utility library. unicode conversion; cleanups; etc
+- courlan: clean/normalize/sample large URL lists
+ => https://github.com/adbar/courlan
+
+### Main Text Extraction
+
+Things to try:
+
+- newspaper3k
+ => basic article extraction. lxml
+- trafilatura
+ => TEI-XML output!
+ => looks very promising
+ => falls back to readability and justext
+- python-readability
+ => improved vs newspaper?
+- dragnet
+- eatiht
+- jusText
+- inscriptis
+ => emphasis on shape/readability of text output? compare with lynx
+- Goose3
+ => metadata and article text
+- news-please
+ => very full-featured. built on scrapy, newspaper, readability
+ => can iterate over common crawl?
+- html2text
+ => actually HTML-to-markdown; no or little "boilerplate removal"
+- boilerpipe (Java)
+ boilerpipe3 (wrapper)
+ boilerpy3 (port)
+
+Comparisons and articles:
+
+- https://www.diffbot.com/benefits/comparison/
+- https://github.com/scrapinghub/article-extraction-benchmark
+ - https://github.com/scrapinghub/article-extraction-benchmark/releases/download/v1.0.0/paper-v1.0.0.pdf
+- https://github.com/rundimeco/waddle
+
+- https://moz.com/devblog/benchmarking-python-content-extraction-algorithms-dragnet-readability-goose-and-eatiht
+- https://hal.archives-ouvertes.fr/hal-02768510v3/document (fr; June 2020)
+ https://translate.google.com/translate?sl=auto&tl=en&u=https%3A%2F%2Fhal.archives-ouvertes.fr%2Fhal-02768510v3%2Fdocument
+- http://eprints.fri.uni-lj.si/1718/1/Kovacic-1.pdf (2012)
+- "Generic Web Content Extraction with Open-Source Software" (2020; trafilatura)
+- "Out-of-the-Box and Into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools"
+ https://hal.archives-ouvertes.fr/hal-02732851/document
+ very on-topic
+- https://cloud.google.com/blog/products/gcp/problem-solving-with-ml-automatic-document-classification
+
+### Reference/Citation Extraction
+
+"Locating and parsing bibliographic references in HTML medical articles"
+https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2903768/
+
+cb2bib (in debian/ubuntu)
+
+
+### Metadata Extraction
+
+OJS 3.x seems to have `citation_fulltext_html_url`. Annoyingly, has an iframe.
+
+http://documents.clockss.org/index.php/LOCKSS:_Extracting_Bibliographic_Metadata
+
+https://blog.dshr.org/2013/04/talk-on-lockss-metadata-extraction-at.html
+
+"OXPath": declarative XPath extension for scraping metadata
+https://journal.code4lib.org/articles/13007
+
+
+## newspaper3k experimentation
+
+ import newspaper
+
+ import nltk
+ nltk.download('punkt')
+
+ # first mondays (OJS) fulltext
+ monday = newspaper.Article("https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729?inline=1")
+ # => ugh, iframe
+ monday.download()
+ monday.parse() # several seconds
+
+ monday.title
+ # Surveillance, stigma and sociotechnical design for HIV
+ monday.text
+ # reasonable; similar to pdftotext?
+ monday.authors
+ # empty
+ monday.images
+ # reasonable?
+
+ nih = newspaper.Article('https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html')
+ nih.download()
+ nih.parse()
+ nih.nlp()
+
+ nih.title
+ # Migration of Monographic Citations to LocatorPlus: Merge Project. NLM Technical Bulletin. Jul-Aug 2002
+ # duplicate journal name in title
+ nih.authors
+ # none
+ nih.text
+ # Ok. missing first character, weirdly
+
+ genders = newspaper.Article('https://web.archive.org/web/20141230080932id_/http://www.genders.org/g58/g58_fairlie.html')
+ genders.download()
+ genders.parse()
+
+ genders.title
+ # Presenting innovative theories in art, literature, history, music, TV and film.
+ # nope: this is title of the journal
+
+ genders.text
+ # Ok. includes title and author in the body.
+
+ dlib = newspaper.Article('http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html')
+ dlib.download()
+ dlib.parse()
+
+ dlib.title
+ # Transforming Libraries and Archives through Crowdsourcing
+ dlib.authors()
+ # none
+ dlib.text
+ # some other junk, but main body there
+
+## trafilatura experimentation
+
+ trafilatura --json -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html' | jq .
+
+ trafilatura --xmltei -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html'
+
+Does not work with `first_monday_ojs_inline`?
+
+May need to test/compare more.
+
+Examples/bugs:
+
+ http://web.archive.org/web/20081120141035id_/http://www.mundanebehavior.org/issues/v5n1/jones.htm
+ poor title detection
+
+ generally, author detection not great.
+ not, apparently, using detection of dc.authors etc
+
+
+## Prod Deployment Notes (2020-12-14)
+
+Created `html_meta` table in `sandcrawler-db`.
+
+Updated ansible roles to deploy persist and import workers. Then ran the roles
+and enabled:
+
+- sandcrawler database (aitio)
+ - sandcrawler-persist-ingest-file-worker@1: restarted
+- blobs (wbgrp-svc169)
+ - sandcrawler-persist-html-teixml-worker@1: started and enabled
+ - sandcrawler-persist-xml-doc-worker@1: started and enabled
+- fatcat prod worker (wbgrp-svc502)
+ - fatcat-import-ingest-web-worker: started and enabled
+
+Test some d-lib and first monday ingests:
+
+ # dlib
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Counter({'estimate': 803, 'ingest_request': 50, 'elasticsearch_release': 50, 'kafka': 50})
+
+ # first monday
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id svz5ul6qozdjhjhk7d627avuja
+
+Starting:
+
+ d-lib: 253 / 1056 preserved (https://fatcat.wiki/container/ugbiirfvufgcjkx33r3cmemcuu/coverage)
+
+Initially, `fatcat-import-ingest-web-worker` is seeing these but doesn't seem
+to be importing.
+
+ # postgresql shell
+ select sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count from html_meta;
+ => initially has_teixml is false for all
+ => fixed in an update
+
+ # weed shell
+ > fs.ls /buckets/sandcrawler/html_body
+ [...]
+ > fs.cat /buckets/sandcrawler/html_body/77/75/7775adf8c7e19151bbe887bfa08a575483291d7c.tei.xml
+ [looks like fine TEI-XML]
+
+Going to debug ingest issue by dumping results to disk and importing manually
+(best way to see counts):
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o -10 | rg html | head -n10 | jq . -c > web_ingest_results.json
+
+ export FATCAT_AUTH_WORKER_CRAWL=[...]
+ ./fatcat_import.py ingest-web-results web_ingest_results.json
+ => Counter({'total': 10, 'skip-update-disabled': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # did some patching (f7a75a01), then re-ran twice and got:
+ => Counter({'total': 10, 'insert': 9, 'skip': 1, 'skip-hit': 1, 'update': 0, 'exists': 0})
+ => Counter({'total': 10, 'exists': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0})
+
+ # looks good!
+
+Re-ingesting all of d-lib:
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Expecting 803 release objects in search queries
+ => Counter({'ingest_request': 803, 'elasticsearch_release': 803, 'estimate': 803, 'kafka': 803})
+
+TODO:
+
+- release ES transform isn't counting these as `in_ia` or preserved (code-only change)
+- no indication in search results (ES schema change)
+- ingest tool should probably look at `in_ia_html` or `in_ia_pdf` for PDF/XML queries (or a `types_in_ia` list?)
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore
new file mode 100644
index 0000000..343a25c
--- /dev/null
+++ b/notes/ingest/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.json
diff --git a/notes/ingest/2019-10-23_testing.md b/notes/ingest/2019-10-23_testing.md
new file mode 100644
index 0000000..481c4e2
--- /dev/null
+++ b/notes/ingest/2019-10-23_testing.md
@@ -0,0 +1,8 @@
+
+exported not-archived DOIs for elife, as well as general list.
+
+ wc -l recent\ missing\ oa\ releases.csv
+ 161828 recent missing oa releases.csv
+
+ wc -l missing\ elife\ DOIs.csv
+ 1779 missing elife DOIs.csv
diff --git a/notes/ingest/2020-01-14_bulk.md b/notes/ingest/2020-01-14_bulk.md
new file mode 100644
index 0000000..9d05cda
--- /dev/null
+++ b/notes/ingest/2020-01-14_bulk.md
@@ -0,0 +1,26 @@
+
+Generate ingest requests from arabesque:
+
+ zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json
+
+ zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json
+
+
+Quick tests locally:
+
+ time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json
+ time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json
+
+These are all wayback success; looking good! Single threaded, from home laptop
+(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even
+with 30x parallelism. Should re-test on actual server. GROBID pre-check should
+help?
+
+With new bulk topic:
+
+ head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Ok, let them rip:
+
+ cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2020-02-04_ingest_backfills.md b/notes/ingest/2020-02-04_ingest_backfills.md
new file mode 100644
index 0000000..73a42ef
--- /dev/null
+++ b/notes/ingest/2020-02-04_ingest_backfills.md
@@ -0,0 +1,148 @@
+
+
+## Using Fatcat Tool
+
+Want to enqueue some backfill URLs to crawl, now that SPNv2 is on the mend.
+
+Example dry-run:
+
+ ./fatcat_ingest.py --dry-run --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife
+
+Big OA from 2020 (past month):
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 158 release objects in search queries
+ Counter({'ingest_request': 158, 'estimate': 158, 'kafka': 158, 'elasticsearch_release': 158})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name elife
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 2312 release objects in search queries
+ Counter({'kafka': 2312, 'ingest_request': 2312, 'elasticsearch_release': 2312, 'estimate': 2312})
+
+ # note: did 100 first to test
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name plos
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 1185 release objects in search queries
+ Counter({'estimate': 1185, 'ingest_request': 1185, 'elasticsearch_release': 1185, 'kafka': 1185})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 89 release objects in search queries
+ Counter({'elasticsearch_release': 89, 'estimate': 89, 'ingest_request': 89, 'kafka': 89})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher ieee
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 499 release objects in search queries
+ Counter({'kafka': 499, 'ingest_request': 499, 'estimate': 499, 'elasticsearch_release': 499})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name bmj
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 28 release objects in search queries
+ Counter({'elasticsearch_release': 28, 'ingest_request': 28, 'kafka': 28, 'estimate': 28})
+
+ ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 6225 release objects in search queries
+ Counter({'estimate': 6225, 'kafka': 500, 'elasticsearch_release': 500, 'ingest_request': 500})
+
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 2920 release objects in search queries
+ Counter({'estimate': 2920, 'elasticsearch_release': 1001, 'ingest_request': 1000, 'kafka': 1000})
+
+Hip corona virus papers:
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 5332 release objects in search queries
+ Counter({'estimate': 5332, 'elasticsearch_release': 2159, 'ingest_request': 2000, 'kafka': 2000})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 110 release objects in search queries
+ Counter({'ingest_request': 110, 'kafka': 110, 'elasticsearch_release': 110, 'estimate': 110})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 589 release objects in search queries
+ Counter({'estimate': 589, 'elasticsearch_release': 589, 'ingest_request': 552, 'kafka': 552})
+
+
+Mixed eLife results:
+
+ ["wrong-mimetype",null,"https://elifesciences.org/articles/54551"]
+ ["success",null,"https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNTE2OTEvZWxpZmUtNTE2OTEtdjEucGRm/elife-51691-v1.pdf?_hash=Jp1cLog1NzIlU%2BvjgLdbM%2BuphOwe5QWUn%2F97tbQBNG4%3D"]
+
+## Re-Request Failed
+
+Select some failed ingest request rows to re-enqueue:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ ) TO '/grande/snapshots/reingest_spn2cdx_20200205.rows.json';
+ -- 1536 rows
+
+Transform back to full requests:
+
+ ./scripts/ingestrequest_row2json.py reingest_spn2cdx_20200205.rows.json > reingest_spn2cdx_20200205.json
+
+Push into kafka (on a kafka broker node):
+
+ cat ~/reingest_spn2cdx_20200205.json | jq . -c | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests -p -1
+
+More:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'error:%'
+ ) TO '/grande/snapshots/reingest_spn2err1_20200205.rows.json';
+ -- COPY 1516
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-error%'
+ ) TO '/grande/snapshots/reingest_spn2err2_20200205.rows.json';
+ -- COPY 16678
+
+The next large ones to try would be `wayback-error` and `cdx-error`, though
+these are pretty generic. Could go through kafka output to try and understand
+those error classes better.
+
+Oof, by mistake enqueued to partition 1 instead of -1 (random), so these will
+take a week or more to actually process. Re-enqueued as -1; ingesting from
+wayback is pretty fast, this should result in mostly wayback ingests. Caught up
+by end of weekend?
+
+## Check Coverages
+
+As follow-ups:
+
+ elife: https://fatcat.wiki/container/en4qj5ijrbf5djxx7p5zzpjyoq/coverage
+ => 2020-02-24: 7187 / 8101 = 88% preserved
+ archivist: https://fatcat.wiki/container/zpobyv4vbranllc7oob56tgci4/coverage
+ => 85 preserved
+ => 2020-02-24: 85 / 3005 preserved (TODO)
+ jcancer: https://fatcat.wiki/container/nkkzpwht7jd3zdftc6gq4eoeey/coverage
+ => 2020 preserved
+ => 2520 preserved
+ => 2020-02-24: 2700 / 2766 preserved
+ plos: https://fatcat.wiki/container/23nqq3odsjhmbi5tqavvcn7cfm/coverage
+ => 2020-02-24: 7580 / 7730 = 98% preserved
+
diff --git a/notes/ingest/2020-02-18_ingest_backfills.md b/notes/ingest/2020-02-18_ingest_backfills.md
new file mode 100644
index 0000000..1ab18f4
--- /dev/null
+++ b/notes/ingest/2020-02-18_ingest_backfills.md
@@ -0,0 +1,42 @@
+
+Select:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-error%'
+ ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json';
+ => COPY 6537
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'wayback-error'
+ ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json';
+ => COPY 33022
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json
+ ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json
+
+Push to kafka:
+
+ cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_waybackerr_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+Many had null `ingest_request_source`, so won't actually import into fatcat:
+
+ bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n
+ 1 "savepapernow-web"
+ 112 "fatcat-ingest-container"
+ 11750 "fatcat-changelog"
+ 21159 null
+
diff --git a/notes/ingest/2020-02-21_ingest_backfills.md b/notes/ingest/2020-02-21_ingest_backfills.md
new file mode 100644
index 0000000..48df910
--- /dev/null
+++ b/notes/ingest/2020-02-21_ingest_backfills.md
@@ -0,0 +1,104 @@
+
+Follow-ups to last ingest backfill. Only run these when ingest request topic is
+empty, and full persist chain has run successfully.
+
+## Corona virus stuff
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV
+
+## Large OA Publishers
+
+Should probably check domain stats/success for all of these first.
+
+Would also be good to have a "randomize" option. Could fake that by dumping to
+disk first.
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier
+
+ ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer
+
+ # ???
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+
+## Fixed OA Publishers (small tests)
+
+ # american archivist
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ => Expecting 2920 release objects in search queries
+ => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25})
+ => good
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ => Expecting 42897 release objects in search queries
+ => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25})
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers
+ => Expecting 35427 release objects in search queries
+ => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25})
+ => mixed results?
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ => Expecting 43111 release objects in search queries
+ => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25})
+ => success, fast
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association"
+ => Expecting 185240 release objects in search queries
+ => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25})
+ => no success? or mixed? skip for now
+
+ # Environmental Health Perspectives (NIH)
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky
+ => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"]
+ => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"]
+ => FIXED
+ => good (but slow?)
+
+ ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University"
+ => Expecting 578057 release objects in search queries
+ => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50})
+ => nothing from tsu.ru? skip for now
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent"
+ => Expecting 4602 release objects in search queries
+ => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25})
+ => good
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+ => Expecting 5690 release objects in search queries
+ => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25})
+ => good
+
+
+## Fixed OA Publishers (full runs)
+
+ # american archivist
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Expecting 2920 release objects in search queries
+ Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ Expecting 42986 release objects in search queries
+ Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ Expecting 43108 release objects in search queries
+ Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262})
+
+ # Environmental Health Perspectives (NIH)
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky
+ Expecting 12699 release objects in search queries
+ Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent"
+ Expecting 4602 release objects in search queries
+ Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+ Expecting 5690 release objects in search queries
+ Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690})
+
diff --git a/notes/ingest/2020-02-22_fixed_domain.txt b/notes/ingest/2020-02-22_fixed_domain.txt
new file mode 100644
index 0000000..a60de42
--- /dev/null
+++ b/notes/ingest/2020-02-22_fixed_domain.txt
@@ -0,0 +1,246 @@
+
+www.degruyter.com
+
+ "/view/books/" didn't have citation_pdf_url, so added custom URL rule.
+
+ Not sure why redirect-loop happening, but isn't with current live ingest
+ tool?
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ www.degruyter.com | redirect-loop | 22023
+ www.degruyter.com | no-pdf-link | 8773
+ www.degruyter.com | no-capture | 8617
+ www.degruyter.com | success | 840
+ www.degruyter.com | link-loop | 59
+ www.degruyter.com | terminal-bad-status | 23
+ www.degruyter.com | wrong-mimetype | 12
+ www.degruyter.com | spn-error | 4
+ www.degruyter.com | spn2-cdx-lookup-failure | 4
+ www.degruyter.com | spn2-error:proxy-error | 1
+ www.degruyter.com | spn-remote-error | 1
+ www.degruyter.com | gateway-timeout | 1
+ www.degruyter.com | petabox-error | 1
+ (13 rows)
+
+www.frontiersin.org
+
+ no pdf link
+
+ seems to live ingest fine? files served from "*.blob.core.windows.net"
+ no fix, just re-ingest.
+
+ domain | status | count
+ ---------------------+-------------------------+-------
+ www.frontiersin.org | no-pdf-link | 17503
+ www.frontiersin.org | terminal-bad-status | 6696
+ www.frontiersin.org | wayback-error | 203
+ www.frontiersin.org | no-capture | 20
+ www.frontiersin.org | spn-error | 6
+ www.frontiersin.org | gateway-timeout | 3
+ www.frontiersin.org | wrong-mimetype | 3
+ www.frontiersin.org | spn2-cdx-lookup-failure | 2
+ www.frontiersin.org | spn2-error:job-failed | 2
+ www.frontiersin.org | spn-remote-error | 1
+ www.frontiersin.org | cdx-error | 1
+ (11 rows)
+
+www.mdpi.com
+
+ terminal-bad-status
+
+ Seems to ingest fine live? No fix, just re-ingest.
+
+ domain | status | count
+ --------------+-------------------------+-------
+ www.mdpi.com | terminal-bad-status | 13866
+ www.mdpi.com | wrong-mimetype | 2693
+ www.mdpi.com | wayback-error | 513
+ www.mdpi.com | redirect-loop | 505
+ www.mdpi.com | success | 436
+ www.mdpi.com | no-capture | 214
+ www.mdpi.com | no-pdf-link | 43
+ www.mdpi.com | spn2-cdx-lookup-failure | 34
+ www.mdpi.com | gateway-timeout | 3
+ www.mdpi.com | petabox-error | 2
+ (10 rows)
+
+www.ahajournals.org | no-pdf-link | 5727
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.ahajournals.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.ahajournals.org%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------------+----------------+-------
+ www.ahajournals.org | no-pdf-link | 5738
+ www.ahajournals.org | wrong-mimetype | 84
+ (2 rows)
+
+
+ pdf | https://doi.org/10.1161/circ.110.19.2977 | 2020-02-23 00:28:55.256296+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/str.49.suppl_1.tp403 | 2020-02-23 00:27:34.950059+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/str.49.suppl_1.tp168 | 2020-02-23 00:25:54.611271+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/jaha.119.012131 | 2020-02-23 00:24:44.244511+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+
+ Ah, the ol' annoying 'cookieAbsent'. Works with live SPNv2 via soft-404
+ detection, but that status wasn't coming through, and needed custom
+ pdf-link detection.
+
+ FIXED: added pdf-link detection
+
+ehp.niehs.nih.gov | no-pdf-link | 5772
+
+ simple custom URL format. but are they also blocking?
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'ehp.niehs.nih.gov'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ domain | status | count
+ -------------------+----------------+-------
+ ehp.niehs.nih.gov | no-pdf-link | 5791
+ ehp.niehs.nih.gov | wrong-mimetype | 11
+ (2 rows)
+
+ FIXED: mostly just slow, custom URL seems to work
+
+journals.tsu.ru | no-pdf-link | 4404
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'journals.tsu.ru'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%journals.tsu.ru%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -----------------+----------------+-------
+ journals.tsu.ru | no-pdf-link | 4409
+ journals.tsu.ru | success | 1
+ journals.tsu.ru | wrong-mimetype | 1
+ (3 rows)
+
+
+ pdf | https://doi.org/10.17223/18572685/57/3 | 2020-02-23 00:45:49.003593+00 | f | no-pdf-link | http://journals.tsu.ru/rusin/&journal_page=archive&id=1907&article_id=42847 | 20200213132322 | 200 |
+ pdf | https://doi.org/10.17223/17267080/71/4 | 2020-02-23 00:31:25.715416+00 | f | no-pdf-link | http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405 | 20200211151825 | 200 |
+ pdf | https://doi.org/10.17223/15617793/399/33 | 2020-02-23 00:29:45.414865+00 | f | no-pdf-link | http://journals.tsu.ru/vestnik/&journal_page=archive&id=1322&article_id=24619 | 20200208152715 | 200 |
+ pdf | https://doi.org/10.17223/19988613/58/15 | 2020-02-23 00:25:24.402838+00 | f | no-pdf-link | http://journals.tsu.ru//history/&journal_page=archive&id=1827&article_id=40501 | 20200212200320 | 200 |
+
+ FIXED: simple new custom PDF link pattern
+
+www.cogentoa.com | no-pdf-link | 4282
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.cogentoa.com'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.cogentoa.com%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ------------------+-------------+-------
+ www.cogentoa.com | no-pdf-link | 4296
+ (1 row)
+
+ pdf | https://doi.org/10.1080/23311932.2015.1022632 | 2020-02-23 01:06:14.040013+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311932.2015.1022632 | 20200208054228 | 200 |
+ pdf | https://doi.org/10.1080/23322039.2020.1730079 | 2020-02-23 01:04:53.754117+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23322039.2020.1730079 | 20200223010431 | 200 |
+ pdf | https://doi.org/10.1080/2331186x.2018.1460901 | 2020-02-23 01:04:03.47563+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/2331186X.2018.1460901 | 20200207200958 | 200 |
+ pdf | https://doi.org/10.1080/23311975.2017.1412873 | 2020-02-23 01:03:08.063545+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 | 20200209034602 | 200 |
+ pdf | https://doi.org/10.1080/23311916.2017.1293481 | 2020-02-23 01:02:42.868424+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311916.2017.1293481 | 20200208101623 | 200 |
+
+ FIXED: simple custom URL-based pattern
+
+chemrxiv.org | no-pdf-link | 4186
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'chemrxiv.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%chemrxiv.org%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ --------------+-------------------------+-------
+ chemrxiv.org | no-pdf-link | 4202
+ chemrxiv.org | wrong-mimetype | 64
+ chemrxiv.org | wayback-error | 14
+ chemrxiv.org | success | 12
+ chemrxiv.org | terminal-bad-status | 4
+ chemrxiv.org | spn2-cdx-lookup-failure | 1
+
+ pdf | https://doi.org/10.26434/chemrxiv.9912812.v1 | 2020-02-23 01:08:34.585084+00 | f | no-pdf-link | https://chemrxiv.org/articles/Proximity_Effect_in_Crystalline_Framework_Materials_Stacking-Induced_Functionality_in_MOFs_and_COFs/9912812/1 | 20200215072929 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.7150097 | 2020-02-23 01:05:48.957624+00 | f | no-pdf-link | https://chemrxiv.org/articles/Systematic_Engineering_of_a_Protein_Nanocage_for_High-Yield_Site-Specific_Modification/7150097 | 20200213002430 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.7833500.v1 | 2020-02-23 00:55:41.013109+00 | f | no-pdf-link | https://chemrxiv.org/articles/Formation_of_Neutral_Peptide_Aggregates_Studied_by_Mass_Selective_IR_Action_Spectroscopy/7833500/1 | 20200210131343 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.8146103 | 2020-02-23 00:52:00.193328+00 | f | no-pdf-link | https://chemrxiv.org/articles/On-Demand_Guest_Release_from_MOF-5_Sealed_with_Nitrophenylacetic_Acid_Photocapping_Groups/8146103 | 20200207215449 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.10101419 | 2020-02-23 00:46:14.086913+00 | f | no-pdf-link | https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 | 20200214044153 | 200 |
+
+ FIXED: complex JSON PDF url extraction; maybe for all figshare?
+
+TODO:
+x many datacite prefixes go to IRs, but have is_oa:false. we should probably crawl by default based on release_type
+ => fatcat branch bnewbold-more-ingest
+- re-ingest all degruyter (doi_prefix:10.1515)
+ 1456169 doi:10.1515\/*
+ 89942 doi:10.1515\/* is_oa:true
+ 36350 doi:10.1515\/* in_ia:false is_oa:true
+ 1290830 publisher:Gruyter
+ 88944 publisher:Gruyter is_oa:true
+ 40034 publisher:Gruyter is_oa:true in_ia:false
+- re-ingest all frontiersin
+ 248165 publisher:frontiers
+ 161996 publisher:frontiers is_oa:true
+ 36093 publisher:frontiers is_oa:true in_ia:false
+ 121001 publisher:frontiers in_ia:false
+- re-ingest all mdpi
+ 43114 publisher:mdpi is_oa:true in_ia:false
+- re-ingest all ahajournals.org
+ 132000 doi:10.1161\/*
+ 6606 doi:10.1161\/* in_ia:false is_oa:true
+ 81349 publisher:"American Heart Association"
+ 5986 publisher:"American Heart Association" is_oa:true in_ia:false
+- re-ingest all ehp.niehs.nih.gov
+ 25522 doi:10.1289\/*
+ 15315 publisher:"Environmental Health Perspectives"
+ 8779 publisher:"Environmental Health Perspectives" in_ia:false
+ 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+- re-ingest all journals.tsu.ru
+ 12232 publisher:"Tomsk State University"
+ 11668 doi:10.17223\/*
+ 4861 publisher:"Tomsk State University" in_ia:false is_oa:true
+- re-ingest all www.cogentoa.com
+ 3421898 doi:10.1080\/*
+ 4602 journal:cogent is_oa:true in_ia:false
+ 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain)
+- re-ingest chemrxiv
+ 8281 doi:10.26434\/chemrxiv*
+ 6918 doi:10.26434\/chemrxiv* in_ia:false
+
+Submit all the above with limits of 1000, then follow up later to check that
+there was success?
+
diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md
new file mode 100644
index 0000000..e18a2ff
--- /dev/null
+++ b/notes/ingest/2020-02_unpaywall.md
@@ -0,0 +1,624 @@
+
+## Stats and Things
+
+ zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt
+
+## Transform
+
+ zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null
+ => 22M 1:31:25 [ 4k/s]
+
+Shard it into batches of roughly 1 million (all are 1098096 +/- 1):
+
+ zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json
+
+Test ingest:
+
+ head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Add a single batch like:
+
+ cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Progress/Status
+
+There are 21,961,928 lines total, in batches of 1,098,097.
+
+ unpaywall_snapshot_2019-11-22.ingest_request.split_00.json
+ => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined)
+ => 2020-02-25 10:35 local: 0
+ unpaywall_snapshot_2019-11-22.ingest_request.split_01.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_02.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_03.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_04.json
+ => 2020-02-25 11:26 local: 4,388,997
+ => 2020-02-25 10:14 local: 1,115,821
+ => 2020-02-26 16:00 local: 265,116
+ unpaywall_snapshot_2019-11-22.ingest_request.split_05.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_06.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_07.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_08.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_09.json
+ => 2020-02-26 16:01 local: 6,843,708
+ => 2020-02-26 16:31 local: 4,839,618
+ => 2020-02-28 10:30 local: 2,619,319
+ unpaywall_snapshot_2019-11-22.ingest_request.split_10.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_11.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_12.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_13.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_14.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_15.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_16.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_17.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_18.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_19.json
+ => 2020-02-28 10:50 local: 13,551,887
+ => 2020-03-01 23:38 local: 4,521,076
+ => 2020-03-02 10:45 local: 2,827,071
+ => 2020-03-02 21:06 local: 1,257,176
+ added about 500k bulk re-ingest to try and work around cdx errors
+ => 2020-03-02 21:30 local: 1,733,654
+
+## Investigate Failures
+
+Guessing that some domains are ultimately going to need direct "recrawl" via
+SPNv2.
+
+ -- top domain failures for unpaywall GWB history ingest
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ watermark.silverchair.com | terminal-bad-status | 258432
+ www.tandfonline.com | no-pdf-link | 203873
+ journals.sagepub.com | no-pdf-link | 126317
+ iopscience.iop.org | terminal-bad-status | 112526
+ files-journal-api.frontiersin.org | terminal-bad-status | 112499
+ pubs.acs.org | no-pdf-link | 94772
+ www.degruyter.com | redirect-loop | 89801
+ www.ahajournals.org | no-pdf-link | 84025
+ society.kisti.re.kr | no-pdf-link | 72849
+ www.nature.com | redirect-loop | 53575
+ babel.hathitrust.org | terminal-bad-status | 41063
+ www.ncbi.nlm.nih.gov | redirect-loop | 40363
+ scialert.net | no-pdf-link | 38340
+ www.degruyter.com | terminal-bad-status | 34913
+ www.journal.csj.jp | no-pdf-link | 30881
+ espace.library.uq.edu.au | redirect-loop | 24570
+ www.jci.org | redirect-loop | 24409
+ aip.scitation.org | wrong-mimetype | 22144
+ www.vr-elibrary.de | no-pdf-link | 17436
+ www.biorxiv.org | wrong-mimetype | 15524
+ ajph.aphapublications.org | no-pdf-link | 15083
+ zookeys.pensoft.net | redirect-loop | 14867
+ dialnet.unirioja.es | redirect-loop | 14486
+ asa.scitation.org | wrong-mimetype | 14261
+ www.nrcresearchpress.com | no-pdf-link | 14254
+ dl.acm.org | redirect-loop | 14223
+ osf.io | redirect-loop | 14103
+ www.oecd-ilibrary.org | redirect-loop | 12835
+ journals.sagepub.com | redirect-loop | 12229
+ iopscience.iop.org | redirect-loop | 11825
+ (30 rows)
+
+ -- top no-capture terminal domains
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ => very few from any domain, interesting. Guess many of these are URLs that have truly never been crawled
+
+ -- top no-capture base domains
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+------------+--------
+ academic.oup.com | no-capture | 429888
+ www.nature.com | no-capture | 273825
+ dergipark.org.tr | no-capture | 119847
+ www.biodiversitylibrary.org | no-capture | 110220
+ escholarship.org | no-capture | 106307
+ onlinelibrary.wiley.com | no-capture | 89771
+ journals.sagepub.com | no-capture | 79297
+ www.cell.com | no-capture | 64242
+ deepblue.lib.umich.edu | no-capture | 58080
+ babel.hathitrust.org | no-capture | 52286
+ hal.archives-ouvertes.fr | no-capture | 48549
+ iopscience.iop.org | no-capture | 42591
+ dash.harvard.edu | no-capture | 40767
+ www.tandfonline.com | no-capture | 40638
+ discovery.ucl.ac.uk | no-capture | 40633
+ www.jstage.jst.go.jp | no-capture | 39780
+ www.doiserbia.nb.rs | no-capture | 39261
+ dspace.mit.edu | no-capture | 37703
+ zookeys.pensoft.net | no-capture | 34562
+ repositorio.unesp.br | no-capture | 34437
+ ashpublications.org | no-capture | 34112
+ www.cambridge.org | no-capture | 33959
+ kclpure.kcl.ac.uk | no-capture | 31455
+ society.kisti.re.kr | no-capture | 30427
+ pure.mpg.de | no-capture | 27650
+ download.atlantis-press.com | no-capture | 27253
+ dialnet.unirioja.es | no-capture | 26886
+ link.springer.com | no-capture | 26257
+ www.valueinhealthjournal.com | no-capture | 24798
+ dspace.library.uu.nl | no-capture | 23234
+ (30 rows)
+
+ -- top no-capture base domains
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+------------+--------
+ academic.oup.com | no-capture | 429888
+ www.nature.com | no-capture | 273825
+ dergipark.org.tr | no-capture | 119847
+ www.biodiversitylibrary.org | no-capture | 110220
+ escholarship.org | no-capture | 106307
+ onlinelibrary.wiley.com | no-capture | 89771
+ journals.sagepub.com | no-capture | 79297
+ www.cell.com | no-capture | 64242
+ deepblue.lib.umich.edu | no-capture | 58080
+ babel.hathitrust.org | no-capture | 52286
+ hal.archives-ouvertes.fr | no-capture | 48549
+ iopscience.iop.org | no-capture | 42591
+ dash.harvard.edu | no-capture | 40767
+ www.tandfonline.com | no-capture | 40638
+ discovery.ucl.ac.uk | no-capture | 40633
+ www.jstage.jst.go.jp | no-capture | 39780
+ www.doiserbia.nb.rs | no-capture | 39261
+ dspace.mit.edu | no-capture | 37703
+ zookeys.pensoft.net | no-capture | 34562
+ repositorio.unesp.br | no-capture | 34437
+ ashpublications.org | no-capture | 34112
+ www.cambridge.org | no-capture | 33959
+ kclpure.kcl.ac.uk | no-capture | 31455
+ society.kisti.re.kr | no-capture | 30427
+ pure.mpg.de | no-capture | 27650
+ download.atlantis-press.com | no-capture | 27253
+ dialnet.unirioja.es | no-capture | 26886
+ link.springer.com | no-capture | 26257
+ www.valueinhealthjournal.com | no-capture | 24798
+ dspace.library.uu.nl | no-capture | 23234
+ (30 rows)
+
+ -- how many ingest requests not crawled at all?
+ SELECT count(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status IS NULL;
+ => 0
+
+ -- "cookie absent" terminal pages, by domain
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------+----------------+--------
+ journals.sagepub.com | no-pdf-link | 126295
+ www.tandfonline.com | no-pdf-link | 116690
+ pubs.acs.org | no-pdf-link | 94619
+ www.ahajournals.org | no-pdf-link | 84016
+ www.journal.csj.jp | no-pdf-link | 30881
+ aip.scitation.org | wrong-mimetype | 22143
+ www.vr-elibrary.de | no-pdf-link | 17436
+ ajph.aphapublications.org | no-pdf-link | 15080
+ asa.scitation.org | wrong-mimetype | 14261
+ www.nrcresearchpress.com | no-pdf-link | 14253
+ journals.ametsoc.org | no-pdf-link | 10500
+ www.journals.uchicago.edu | no-pdf-link | 6917
+ www.icevirtuallibrary.com | no-pdf-link | 6484
+ www.journals.uchicago.edu | wrong-mimetype | 6191
+ www.healthaffairs.org | no-pdf-link | 5732
+ pubsonline.informs.org | no-pdf-link | 5672
+ pinnacle-secure.allenpress.com | no-pdf-link | 5013
+ www.worldscientific.com | no-pdf-link | 4560
+ www.ajronline.org | wrong-mimetype | 4523
+ ehp.niehs.nih.gov | no-pdf-link | 4514
+ www.future-science.com | no-pdf-link | 4091
+ pubs.acs.org | wrong-mimetype | 4015
+ aip.scitation.org | no-pdf-link | 3916
+ www.futuremedicine.com | no-pdf-link | 3821
+ asa.scitation.org | no-pdf-link | 3644
+ www.liebertpub.com | no-pdf-link | 3345
+ physicstoday.scitation.org | no-pdf-link | 3005
+ pubs.cif-ifc.org | no-pdf-link | 2761
+ epubs.siam.org | wrong-mimetype | 2583
+ www.ajronline.org | no-pdf-link | 2563
+ (30 rows)
+
+ -- "cookie absent" terminal page failures, total count
+ SELECT count(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent';
+
+ => 654885
+
+ -- NOT "cookie absent" terminal page failures, total count
+ SELECT count(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent';
+
+ => 1403837
+
+Looks like these domains are almost all "cookieAbsent" blocking:
+- journals.sagepub.com
+- pubs.acs.org
+- ahajournals.org
+- www.journal.csj.jp
+- aip.scitation.org
+
+Grab some individual URLs to test:
+
+ SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ORDER BY updated DESC
+ LIMIT 25;
+
+NOT cookieAbsent testing with regular ingest tool:
+- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success
+- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, success
+- osf.io success
+
+ SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ORDER BY updated DESC
+ LIMIT 25;
+
+cookieAbsent testing with regular ingest tool:
+- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works
+
+The main distinguisher is status. terminal-bad-status can be ingested (live)
+successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled.
+
+## Heritrix Plan
+
+Generate following ingest request batches:
+
+- no-capture status from unpaywall
+- all other failures except /cookieAbsent
+- /cookieAbsent failures
+
+Plan will be to crawl no-capture first (to completion), then try the other
+non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2.
+
+Because there are so few "no-capture on second hop" cases, will not enqueue
+both terminal urls and base urls, only base urls.
+
+Should definitely skip/filter:
+
+- www.ncbi.nlm.nih.gov
+
+## Ingest Request Export
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json';
+ => 4,855,142
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json';
+ => 1,403,837
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json
+
+Note: will probably end up re-running the below after crawling+ingesting the above:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json';
+ => 0
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.status != 'terminal-bad-status'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
+ => 654,885
+
+## Batch Ingest
+
+Test small batch:
+
+ head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+ cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # there was a broken line in there, so...
+ # parse error: Expected separator between values at line 1367873, column 175
+ # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null
+ tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Note that the crawl is not entirely complete and not all CDX seem to have been
+loaded, so may need to iterate. About 10% are still "no capture". May want or
+need to additionally crawl the terminal URLs, not the base URLs.
+
+## Post-ingest stats
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 17354494
+ no-pdf-link | 1471076
+ no-capture | 1135992
+ redirect-loop | 837842
+ terminal-bad-status | 803081
+ cdx-error | 219746
+ wrong-mimetype | 100723
+ link-loop | 16013
+ wayback-error | 12448
+ null-body | 9444
+ redirects-exceeded | 600
+ petabox-error | 411
+ bad-redirect | 17
+ bad-gzip-encoding | 4
+ spn2-cdx-lookup-failure | 3
+ gateway-timeout | 1
+ spn2-error:job-failed | 1
+ spn2-error | 1
+ (18 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 330211
+ watermark.silverchair.com | terminal-bad-status | 324599
+ www.tandfonline.com | no-pdf-link | 242724
+ journals.sagepub.com | no-pdf-link | 202050
+ iopscience.iop.org | terminal-bad-status | 144063
+ files-journal-api.frontiersin.org | terminal-bad-status | 121719
+ pubs.acs.org | no-pdf-link | 104535
+ www.ahajournals.org | no-pdf-link | 102653
+ society.kisti.re.kr | no-pdf-link | 101787
+ www.degruyter.com | redirect-loop | 95130
+ www.nature.com | redirect-loop | 87534
+ onlinelibrary.wiley.com | no-pdf-link | 84432
+ www.cell.com | redirect-loop | 61496
+ www.degruyter.com | terminal-bad-status | 42919
+ babel.hathitrust.org | terminal-bad-status | 41813
+ www.ncbi.nlm.nih.gov | redirect-loop | 40488
+ scialert.net | no-pdf-link | 38341
+ ashpublications.org | no-pdf-link | 34889
+ dialnet.unirioja.es | terminal-bad-status | 32076
+ www.journal.csj.jp | no-pdf-link | 30881
+ pure.mpg.de | redirect-loop | 26163
+ www.jci.org | redirect-loop | 24701
+ espace.library.uq.edu.au | redirect-loop | 24591
+ www.valueinhealthjournal.com | redirect-loop | 23740
+ www.vr-elibrary.de | no-pdf-link | 23332
+ aip.scitation.org | wrong-mimetype | 22144
+ osf.io | redirect-loop | 18513
+ www.journals.elsevier.com | no-pdf-link | 16710
+ www.spandidos-publications.com | redirect-loop | 15711
+ www.biorxiv.org | wrong-mimetype | 15513
+ (30 rows)
+
+Dump lists for another iteration of bulk ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json';
+ => 278,876
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json';
+ =>
+
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json
+
+ cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-03-02_ingests.txt b/notes/ingest/2020-03-02_ingests.txt
new file mode 100644
index 0000000..e98ef33
--- /dev/null
+++ b/notes/ingest/2020-03-02_ingests.txt
@@ -0,0 +1,174 @@
+
+## protocols.io
+
+Tested that single ingest is working, and they fixed PDF format on their end
+recently.
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --name protocols.io
+ => Expecting 8448 release objects in search queries
+ => Counter({'estimate': 8448, 'kafka': 8448, 'ingest_request': 8448, 'elasticsearch_release': 8448})
+
+## backfill follow-ups
+
+- re-ingest all degruyter (doi_prefix:10.1515)
+ 89942 doi:10.1515\/* is_oa:true
+ 36350 doi:10.1515\/* in_ia:false is_oa:true
+ 40034 publisher:Gruyter is_oa:true in_ia:false
+ => update:
+ 135926 doi:10.1515\/* is_oa:true
+ 50544 doi:10.1515\/* in_ia:false is_oa:true
+ 54880 publisher:Gruyter is_oa:true in_ia:false
+- re-ingest all frontiersin
+ 36093 publisher:frontiers is_oa:true in_ia:false
+ => update
+ 22444 publisher:frontiers is_oa:true in_ia:false
+ 22029 doi_prefix:10.3389 is_oa:true in_ia:false
+
+ select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3389/%' group by status order by count(*) desc;
+
+ status | count
+ -------------------------------------+-------
+ success | 34721
+ no-pdf-link | 18157
+ terminal-bad-status | 6799
+ cdx-error | 1805
+ wayback-error | 333
+ no-capture | 301
+ [...]
+
+ select * from ingest_file_result where base_url like 'https://doi.org/10.17723/aarc%' and status = 'no-pdf-link' order by updated desc limit 100;
+
+- re-ingest all mdpi
+ 43114 publisher:mdpi is_oa:true in_ia:false
+ => update
+ 8548 publisher:mdpi is_oa:true in_ia:false
+
+ select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3390/%' group by status order by count(*) desc;
+ status | count
+ -------------------------------------+--------
+ success | 108971
+ cdx-error | 6655
+ wrong-mimetype | 3359
+ terminal-bad-status | 1299
+ wayback-error | 151
+ spn2-cdx-lookup-failure | 87
+
+ => added hack for gzip content-encoding coming through pdf fetch
+ => will re-ingest all after pushing fix
+
+- re-ingest all ahajournals.org
+ 132000 doi:10.1161\/*
+ 6606 doi:10.1161\/* in_ia:false is_oa:true
+ 81349 publisher:"American Heart Association"
+ 5986 publisher:"American Heart Association" is_oa:true in_ia:false
+ => update
+ 1337 publisher:"American Heart Association" is_oa:true in_ia:false
+
+ status | count
+ -------------------------------------+-------
+ success | 1480
+ cdx-error | 1176
+ spn2-cdx-lookup-failure | 514
+ no-pdf-link | 85
+ wayback-error | 25
+ spn2-error:job-failed | 18
+
+ => will re-run errors
+- re-ingest all ehp.niehs.nih.gov
+ 25522 doi:10.1289\/*
+ 15315 publisher:"Environmental Health Perspectives"
+ 8779 publisher:"Environmental Health Perspectives" in_ia:false
+ 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+ => update
+ 7547 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+- re-ingest all journals.tsu.ru
+ 12232 publisher:"Tomsk State University"
+ 11668 doi:10.17223\/*
+ 4861 publisher:"Tomsk State University" in_ia:false is_oa:true
+ => update
+ 2605 publisher:"Tomsk State University" in_ia:false is_oa:true
+ => just need to retry these? seem fine
+- re-ingest all www.cogentoa.com
+ 3421898 doi:10.1080\/*
+ 4602 journal:cogent is_oa:true in_ia:false
+ 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain)
+ => update
+ 254 journal:cogent is_oa:true in_ia:false
+- re-ingest chemrxiv
+ 8281 doi:10.26434\/chemrxiv*
+ 6918 doi:10.26434\/chemrxiv* in_ia:false
+ => update
+ 4890 doi:10.26434\/chemrxiv* in_ia:false
+ => re-ingest
+ => allow non-OA
+
+ # american archivist
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911})
+ => 2020-02-04: 85 / 3,005
+ => 2020-03-02: 2,182 / 3,005 preserved. some no-pdf-link, otherwise just a bunch of spn2-error
+ => looks like the no-pdf-link results are due to a pinnacle-secure.allenpress.com soft-blocking loop
+
+
+## backfill re-ingests
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl container --container-id zpobyv4vbranllc7oob56tgci4
+ => Counter({'elasticsearch_release': 823, 'estimate': 823, 'ingest_request': 814, 'kafka': 814})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ => Counter({'elasticsearch_release': 54880, 'estimate': 54880, 'kafka': 51497, 'ingest_request': 51497})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query 'publisher:"Tomsk State University"'
+ => Counter({'ingest_request': 2605, 'kafka': 2605, 'elasticsearch_release': 2605, 'estimate': 2605})
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ => Counter({'estimate': 8548, 'elasticsearch_release': 8548, 'ingest_request': 6693, 'kafka': 6693})
+ => NOTE: about 2k not enqueued
+
+## re-ingest all broken
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-%'
+ ) TO '/grande/snapshots/reingest_spn2_20200302.rows.json';
+ => COPY 14849
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ ) TO '/grande/snapshots/reingest_cdxerr_20200302.rows.json';
+ => COPY 507610
+
+ This is a huge number! Re-ingest via bulk?
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2_20200302.rows.json > reingest_spn2_20200302.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdxerr_20200302.rows.json > reingest_cdxerr_20200302.json
+
+Push to kafka:
+
+ cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ # accidentally also piped the above through ingest-file-requests-bulk...
+ # which could actually be bad
+ cat reingest_cdxerr_20200302.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## biorxiv/medrxiv
+
+ 8026 doi:10.1101\/20*
+ 2159 doi:10.1101\/20* in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 'doi:10.1101\/20* in_ia:false'
+ => Counter({'estimate': 2159, 'ingest_request': 2159, 'elasticsearch_release': 2159, 'kafka': 2159})
+
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction of releases "in IA", but which
+are not marked as OA, so they are not being crawled regularly.
+
+TODO: add things like list of unpaywall ISSN / OA status to try and find more
+"practical" / bronze OA
+
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md
new file mode 100644
index 0000000..428ce05
--- /dev/null
+++ b/notes/ingest/2020-03_mag.md
@@ -0,0 +1,576 @@
+
+Rough plan:
+
+- run bulk and/or regular ingest requests for just those of AIT partners (200k?)
+- persist ingest requests (22 million or so)
+- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
+- crawl those which are no-capture
+
+
+## Generate Requests
+
+Newer version of `mag_ingest_request.sh` script requires venv with urlcanon
+installed.
+
+Starting with the 2020-01-23 MAG dump, will generate a full ingest request set
+(including DOI `ext_id` when available), with any dominant domains removed (eg,
+arxiv.org):
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
+ => previously 25.6M
+ => 25.6M 2:29:43 [2.85k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
+ => 4.3M 0:25:45 [2.78k/s]
+
+ export LC_ALL=C
+ cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id
+
+ zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
+ => 6,504,907
+
+ zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
+ => 4,369,832
+
+ cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
+ => previously 15,707,405
+ => 15,702,581
+
+ cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
+ => 0
+ URL encoding seems to be working
+
+## Persist Ingest Requests
+
+First persist the PMID ingest requests, then the all/doi file. The reason for
+this order is that the all/doi file will have some rows with no DOI (and thus
+no `ext_id`), while the PMID file will not.
+
+ # small sample
+ head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 10, 'skip-result-fields': 10})
+ JSON lines pushed: Counter({'total': 10, 'pushed': 10})
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ => 4.3M 0:16:46 [4.27k/s]
+ Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
+ JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
+ => hit a bug on first attempt, which is why total/insert results don't match
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
+ => 25.6M 2:21:54 [3.01k/s]
+ Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
+ JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})
+
+
+## Crawl/Dupe Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+After just PMID links:
+
+ status | count
+ ---------------------+---------
+ | 3000115
+ success | 1126881
+ no-capture | 69459
+ terminal-bad-status | 30259
+ redirect-loop | 11656
+ no-pdf-link | 2836
+ wrong-mimetype | 1456
+ link-loop | 1259
+ wayback-error | 1232
+ cdx-error | 932
+ null-body | 85
+ petabox-error | 50
+ bad-redirect | 1
+ (13 rows)
+
+After all links:
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag';
+ => 25596563
+
+
+ status | count
+ ---------------------+----------
+ | 21130841
+ success | 3915682
+ no-capture | 391813
+ terminal-bad-status | 76488
+ redirect-loop | 44202
+ wrong-mimetype | 16418
+ no-pdf-link | 10995
+ wayback-error | 3679
+ cdx-error | 3414
+ link-loop | 2098
+ null-body | 709
+ petabox-error | 221
+ bad-gzip-encoding | 2
+ bad-redirect | 1
+ (14 rows)
+
+Somewhat more un-ingested than expected.
+
+Dump requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
+ => COPY 21,130,841
+
+Transform and shuf:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
+ => 21.1M 0:18:57 [18.6k/s]
+
+## Bulk Ingest Partner Output
+
+These are subsets of the full list from potential AIT-S partners; want to run
+these through the pipeline before the full batch. Duplication against the full
+batch should be minimal.
+
+Size:
+
+ bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
+ 29007
+ bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
+ 34265 ingest_requests_mag-2020-01-23.cornell.json
+
+Test ingest:
+
+ head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full ingests:
+
+ cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Bulk Ingest
+
+Shard it into batches of roughly 1 million:
+
+ cd /grande/snapshots/
+ zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json
+
+Add a single batch like:
+
+ cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ partner ingests (see above)
+ => 2020-03-05 12:49: 118,396
+ 1056543 mag_noingest_20200305.ingest_request.split_00.json
+ => 2020-03-05 14:34: 1,055,224
+ => check on stats/ratios; filter by ingest update time?
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json
+ 1056542 mag_noingest_20200305.ingest_request.split_02.json
+ 1056542 mag_noingest_20200305.ingest_request.split_03.json
+ 1056542 mag_noingest_20200305.ingest_request.split_04.json
+ 1056542 mag_noingest_20200305.ingest_request.split_05.json
+ 1056542 mag_noingest_20200305.ingest_request.split_06.json
+ 1056542 mag_noingest_20200305.ingest_request.split_07.json
+ 1056542 mag_noingest_20200305.ingest_request.split_08.json
+ 1056542 mag_noingest_20200305.ingest_request.split_09.json
+ => 2020-03-05 18:04: 10,009,297
+ => 2020-03-06 16:53: 6,553,946
+ 1056542 mag_noingest_20200305.ingest_request.split_10.json
+ 1056542 mag_noingest_20200305.ingest_request.split_11.json
+ 1056542 mag_noingest_20200305.ingest_request.split_12.json
+ 1056542 mag_noingest_20200305.ingest_request.split_13.json
+ 1056542 mag_noingest_20200305.ingest_request.split_14.json
+ 1056542 mag_noingest_20200305.ingest_request.split_15.json
+ 1056542 mag_noingest_20200305.ingest_request.split_16.json
+ 1056542 mag_noingest_20200305.ingest_request.split_17.json
+ 1056542 mag_noingest_20200305.ingest_request.split_18.json
+ 1056542 mag_noingest_20200305.ingest_request.split_19.json
+ => 2020-03-06 16:59: 17,001,032
+
+Stats from bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ ---------------------+----------
+ no-capture | 12237193
+ success | 11991293
+ no-pdf-link | 521691
+ redirect-loop | 437192
+ terminal-bad-status | 231181
+ link-loop | 92633
+ cdx-error | 33631
+ wrong-mimetype | 28638
+ wayback-error | 19651
+ null-body | 2682
+ petabox-error | 727
+ | 47
+ bad-redirect | 44
+ bad-gzip-encoding | 7
+ (14 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------------+---------------------+--------
+ dialnet.unirioja.es | redirect-loop | 240967
+ onlinelibrary.wiley.com | no-pdf-link | 147696
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ iopscience.iop.org | terminal-bad-status | 69591
+ febs.onlinelibrary.wiley.com | no-pdf-link | 49874
+ www.researchgate.net | redirect-loop | 42859
+ journals.sagepub.com | no-pdf-link | 27448
+ papers.ssrn.com | redirect-loop | 27328
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 20232
+ science.sciencemag.org | link-loop | 17811
+ espace.library.uq.edu.au | redirect-loop | 17185
+ bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301
+ anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746
+ www.tandfonline.com | no-pdf-link | 13303
+ aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070
+ link.springer.com | redirect-loop | 10594
+ www.redalyc.org:9081 | no-pdf-link | 10515
+ watermark.silverchair.com | terminal-bad-status | 9739
+ www.bmj.com | link-loop | 9389
+ www.repository.naturalis.nl | redirect-loop | 8213
+ bjp.rcpsych.org | link-loop | 8045
+ aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814
+ nph.onlinelibrary.wiley.com | no-pdf-link | 7801
+ iopscience.iop.org | redirect-loop | 7697
+ journals.tubitak.gov.tr | wrong-mimetype | 7159
+ www.biorxiv.org | wrong-mimetype | 7067
+ www.erudit.org | redirect-loop | 6819
+ besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254
+ (30 rows)
+
+Domains to follow-up (eg, sandcrawler ingest tests/tweaks):
+- dialnet.unirioja.es | redirect-loop | 240967
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+
+The dialnet.unirioja.es ones may be worth re-crawling via heritrix?
+
+Top uncrawled domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------+------------+--------
+ ieeexplore.ieee.org | no-capture | 957835
+ link.springer.com | no-capture | 394121
+ www.researchgate.net | no-capture | 376974
+ cyberleninka.ru | no-capture | 376012
+ iopscience.iop.org | no-capture | 348791
+ papers.ssrn.com | no-capture | 286860
+ dergipark.org.tr | no-capture | 217556
+ dialnet.unirioja.es | no-capture | 214398
+ academic.oup.com | no-capture | 212364
+ www.tandfonline.com | no-capture | 148940
+ journals.sagepub.com | no-capture | 144695
+ www.papersearch.net | no-capture | 138986
+ absimage.aps.org | no-capture | 111976
+ apps.dtic.mil | no-capture | 106984
+ www.cambridge.org | no-capture | 97533
+ www.bmj.com | no-capture | 92437
+ bioone.org | no-capture | 87573
+ science.sciencemag.org | no-capture | 75723
+ shodhganga.inflibnet.ac.in:8080 | no-capture | 75395
+ www.jstor.org | no-capture | 73230
+ works.bepress.com | no-capture | 68747
+ www.scielo.org.co | no-capture | 59650
+ hrcak.srce.hr | no-capture | 59332
+ muse.jhu.edu | no-capture | 57828
+ onlinelibrary.wiley.com | no-capture | 55621
+ www.jbc.org | no-capture | 54608
+ www.jstage.jst.go.jp | no-capture | 53631
+ www.redalyc.org | no-capture | 50406
+ lup.lub.lu.se | no-capture | 47469
+ www.dtic.mil | no-capture | 41820
+ (30 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtered to exclude some domains that we don't expect to
+crawl via heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
+ => COPY 11714199
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
+
+## Bulk Ingest of Heritrix Content
+
+Small sample:
+
+ head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ 2020-04-07 12:19 (pacific): 11,703,871
+
+## Post-bulk-ingest
+
+Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need
+to re-try things like cdx-error.
+
+Current status:
+
+ status | count
+ -------------------------------+----------
+ success | 18491799
+ redirect-loop | 1968530
+ no-capture | 1373657
+ no-pdf-link | 1311842
+ link-loop | 1296439
+ terminal-bad-status | 627577
+ cdx-error | 418278
+ wrong-mimetype | 50141
+ wayback-error | 37159
+ petabox-error | 11249
+ null-body | 6295
+ gateway-timeout | 3051
+ spn2-cdx-lookup-failure | 328
+ spn2-error:invalid-url-syntax | 93
+ bad-redirect | 75
+ | 47
+ invalid-host-resolution | 28
+ spn2-error | 10
+ bad-gzip-encoding | 7
+ redirects-exceeded | 2
+ (20 rows)
+
+Lots of cdx-error to retry.
+
+The no-capture links are probably a mix of domain-blocklist and things that
+failed in bulk mode. Will dump and re-attempt them:
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json';
+ => 859849
+
+What domains are these?
+
+ cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30
+
+Let's filter down more:
+
+ cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json
+
+ wc -l mag_nocapture_20200420.rows.filtered.json
+ 423085 mag_nocapture_20200420.rows.filtered.json
+
+Ok, enqueue!
+
+ cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Final Stats
+
+... for this round of ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+----------
+ success | 18712849
+ redirect-loop | 2008110
+ no-pdf-link | 1337012
+ link-loop | 1326761
+ no-capture | 1030693
+ terminal-bad-status | 637143
+ gateway-timeout | 193194
+ cdx-error | 125907
+ spn2-cdx-lookup-failure | 77842
+ wrong-mimetype | 50882
+ wayback-error | 40278
+ invalid-host-resolution | 35201
+ petabox-error | 11254
+ null-body | 6485
+ spn2-error | 1643
+ spn2-error:job-failed | 747
+ spn2-error:invalid-url-syntax | 325
+ spn2-error:soft-time-limit-exceeded | 190
+ bad-redirect | 77
+ | 47
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ domain | status | count
+ ---------------------------------+---------------------+--------
+ ieeexplore.ieee.org | redirect-loop | 677712
+ cyberleninka.ru | link-loop | 308390
+ papers.ssrn.com | link-loop | 281804
+ ieeexplore.ieee.org | link-loop | 273559
+ dialnet.unirioja.es | redirect-loop | 240504
+ dialnet.unirioja.es | terminal-bad-status | 232481
+ onlinelibrary.wiley.com | no-pdf-link | 220932
+ iopscience.iop.org | terminal-bad-status | 172480
+ validate.perfdrive.com | no-pdf-link | 172312
+ link.springer.com | redirect-loop | 130398
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382
+ iopscience.iop.org | redirect-loop | 105234
+ www.bmj.com | link-loop | 100354
+ www.researchgate.net | redirect-loop | 84366
+ www.cambridge.org | link-loop | 83171
+ jamanetwork.com | no-pdf-link | 75053
+ febs.onlinelibrary.wiley.com | no-pdf-link | 74872
+ www.jstor.org | redirect-loop | 72059
+ journals.sagepub.com | no-pdf-link | 63028
+ science.sciencemag.org | redirect-loop | 62927
+ profile.thieme.de | no-pdf-link | 62406
+ cyberleninka.ru | redirect-loop | 56733
+ link.springer.com | link-loop | 47608
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 30180
+ science.sciencemag.org | link-loop | 29908
+ papers.ssrn.com | redirect-loop | 27255
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789
+ www.computer.org | no-pdf-link | 26444
+ watermark.silverchair.com | terminal-bad-status | 25934
+ www.nature.com | redirect-loop | 25306
+ (30 rows)
diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md
new file mode 100644
index 0000000..fedaba0
--- /dev/null
+++ b/notes/ingest/2020-03_s2.md
@@ -0,0 +1,35 @@
+
+Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these
+ingested, as well as any previous existing content.
+
+Also, there are a bunch of PDF outlinks to the web; should do S2-specific
+matching and ingest of those.
+
+There are a few categories of paper from pdfs.s.o:
+
+1. we had previous GWB crawl, didn't re-crawl
+2. we had PDF from elsewhere on the web, didn't re-crawl
+3. crawled successfully
+4. crawl failed
+
+In this ingest, want to get all of categories 1 and 3. Could try to do this by
+dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl),
+and join that against the ingest request list.
+
+For other random web URLs, can do the usual persist/backfill/recrawl pipeline.
+
+## Create Seedlist
+
+ zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz
+ zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz
+
+ zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list
+ zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list
+
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz
+
+ zcat s2_external_ingestrequest.json.gz | wc -l
+ 41201427
+ zcat s2_hosted_ingestrequest.json.gz | wc -l
+ 23345761
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@
+
+Want to ensure seedlists from Wanfang and CNKI are captured in wayback.
+
+Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
+They are heterogeneous after redirect.
+
+CNKI are trickier. The PDF URLs definitely can't be crawled directly... but the
+info ones probably can, then crawl on to PDF? At least some seem to capture OK.
+
+Need scope and identifiers for ingest requests. Let's do:
+
+ cnki_covid19 / <ident>
+ wanfang_covid19 / <ident>
+
+Source: scrape-covid19
+
+## Commands
+
+ # in sandcrawler pipenv
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json
+
+
+ cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
+ cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8
+
+## Status
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+2020-04-15:
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | spn2-cdx-lookup-failure | 1588
+ pdf | success | 671
+ pdf | gateway-timeout | 507
+ pdf | no-pdf-link | 181
+ pdf | wayback-error | 30
+ pdf | spn2-error:job-failed | 20
+ pdf | spn2-error | 7
+ pdf | spn2-error:soft-time-limit-exceeded | 3
+ pdf | spn2-error:pending | 2
+ (9 rows)
+
+## Re-Try
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status != 'no-pdf-link'
+ AND ingest_file_result.status != 'link-loop'
+ ) TO '/grande/snapshots/reingest_covid19.rows.json';
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json
+
+ cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
+
diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md
new file mode 100644
index 0000000..0fc7e67
--- /dev/null
+++ b/notes/ingest/2020-04_datacite.md
@@ -0,0 +1,121 @@
+
+After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many
+of the DOIs are for, eg, datasets, and don't want to waste time on those.
+
+Instead of using full ingest request file from the crawl, will generate a new
+ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
+
+## Generate Requests
+
+ ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json
+ => Expecting 8905453 release objects in search queries
+ => 8.91M 11:49:50 [ 209 /s]
+ => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453})
+
+## Bulk Ingest
+
+ cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Ingest Stats
+
+Note that this will have a small fraction of non-datacite results mixed in (eg,
+from COVID-19 targeted crawls):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 4646767
+ redirect-loop | 1447229
+ no-capture | 860235
+ success | 849501
+ terminal-bad-status | 174869
+ cdx-error | 159805
+ wayback-error | 18076
+ wrong-mimetype | 11169
+ link-loop | 8410
+ gateway-timeout | 4034
+ spn2-cdx-lookup-failure | 510
+ petabox-error | 339
+ null-body | 251
+ spn2-error | 19
+ spn2-error:job-failed | 14
+ bad-gzip-encoding | 13
+ timeout | 5
+ spn2-error:soft-time-limit-exceeded | 4
+ invalid-host-resolution | 2
+ spn2-error:pending | 1
+ (20 rows)
+
+Top domains/statuses (excluding success):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------------+---------------------+--------
+ ssl.fao.org | no-pdf-link | 862277
+ www.e-periodica.ch | no-pdf-link | 746781
+ www.researchgate.net | redirect-loop | 664524
+ dlc.library.columbia.edu | no-pdf-link | 493111
+ www.die-bonn.de | redirect-loop | 352903
+ figshare.com | no-pdf-link | 319709
+ statisticaldatasets.data-planet.com | no-pdf-link | 309584
+ catalog.paradisec.org.au | redirect-loop | 225396
+ zenodo.org | no-capture | 193201
+ digi.ub.uni-heidelberg.de | no-pdf-link | 184974
+ open.library.ubc.ca | no-pdf-link | 167841
+ zenodo.org | no-pdf-link | 130617
+ www.google.com | no-pdf-link | 111312
+ www.e-manuscripta.ch | no-pdf-link | 79192
+ ds.iris.edu | no-pdf-link | 77649
+ data.inra.fr | no-pdf-link | 69440
+ www.tib.eu | no-pdf-link | 63872
+ www.egms.de | redirect-loop | 53877
+ archaeologydataservice.ac.uk | redirect-loop | 52838
+ d.lib.msu.edu | no-pdf-link | 45297
+ www.e-rara.ch | no-pdf-link | 45163
+ springernature.figshare.com | no-pdf-link | 42527
+ boris.unibe.ch | no-pdf-link | 40816
+ www.research-collection.ethz.ch | no-capture | 40350
+ spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059
+ repository.dri.ie | terminal-bad-status | 32760
+ othes.univie.ac.at | no-pdf-link | 32558
+ repositories.lib.utexas.edu | no-capture | 31526
+ posterng.netkey.at | no-pdf-link | 30315
+ zenodo.org | terminal-bad-status | 29614
+ (30 rows)
+
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..a5e3bb1
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,312 @@
+
+A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
+not released for more than a month).
+
+Primary goal is:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests. eg, any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+ => 24.7M 5:17:03 [ 1.3k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 24.7M
+ => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
+Second time:
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+ => 25.2M 3:16:28 [2.14k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+ => 3696189
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+ => 1799760
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and then one with certain domains filtered out, which we may
+or may not bother trying to ingest (due to expectation of failure).
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+ => 2734145
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+ => 2602408
+
+NOTE: forgot here to transform from "rows" to ingest requests.
+
+Not actually a very significant size difference after all.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
+## Re-Ingest Post-Crawl
+
+NOTE: if we *do* want to do cleanup eventually, could look for fatcat edits
+between 2020-04-01 and 2020-05-25 which have limited "extra" metadata (eg, no
+evidence or `oa_status`).
+
+The earlier bulk ingests were done wrong (forgot to transform from rows to full
+ingest request docs), so going to re-do those, which should be a superset of
+the nocapture crawl URLs:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-04-08.json
+ => 1.26M 0:00:58 [21.5k/s]
+ => previously: 3,696,189
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-05-03.json
+ => 1.26M 0:00:56 [22.3k/s]
+
+Crap, looks like the 2020-04-08 segment got overwritten with 2020-05 data by
+accident. Hrm... need to re-ingest *all* recent unpaywall URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ ) TO '/grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json';
+ => COPY 5691106
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json
+ => 5.69M 0:04:26 [21.3k/s]
+
+Start small:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks good (whew), run the full thing:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Post-ingest stats (2020-08-28)
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 22063013
+ no-pdf-link | 2192606
+ redirect-loop | 1471135
+ terminal-bad-status | 995106
+ no-capture | 359440
+ cdx-error | 358909
+ wrong-mimetype | 111685
+ wayback-error | 50705
+ link-loop | 29359
+ null-body | 13667
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1007
+ redirects-exceeded | 747
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 415441
+ watermark.silverchair.com | terminal-bad-status | 345937
+ www.tandfonline.com | no-pdf-link | 262488
+ journals.sagepub.com | no-pdf-link | 235707
+ onlinelibrary.wiley.com | no-pdf-link | 225876
+ iopscience.iop.org | terminal-bad-status | 170783
+ www.nature.com | redirect-loop | 145522
+ www.degruyter.com | redirect-loop | 131898
+ files-journal-api.frontiersin.org | terminal-bad-status | 126091
+ pubs.acs.org | no-pdf-link | 119223
+ society.kisti.re.kr | no-pdf-link | 112401
+ www.ahajournals.org | no-pdf-link | 105953
+ dialnet.unirioja.es | terminal-bad-status | 96505
+ www.cell.com | redirect-loop | 87560
+ www.ncbi.nlm.nih.gov | redirect-loop | 49890
+ ageconsearch.umn.edu | redirect-loop | 45989
+ ashpublications.org | no-pdf-link | 45833
+ pure.mpg.de | redirect-loop | 45278
+ www.degruyter.com | terminal-bad-status | 43642
+ babel.hathitrust.org | terminal-bad-status | 42057
+ osf.io | redirect-loop | 41119
+ scialert.net | no-pdf-link | 39009
+ dialnet.unirioja.es | redirect-loop | 38839
+ www.jci.org | redirect-loop | 34209
+ www.spandidos-publications.com | redirect-loop | 33167
+ www.journal.csj.jp | no-pdf-link | 30915
+ journals.openedition.org | redirect-loop | 30409
+ www.valueinhealthjournal.com | redirect-loop | 30090
+ dergipark.org.tr | no-pdf-link | 29146
+ journals.ametsoc.org | no-pdf-link | 29133
+ (30 rows)
+
+Enqueue internal failures for re-ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/unpaywall_errors_2020-08-28.rows.json';
+ => 409606
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_errors_2020-08-28.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_errors_2020-08-28.requests.json
+
+ cat /grande/snapshots/unpaywall_errors_2020-08-28.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+And after *that* (which ran quickly):
+
+ status | count
+ -------------------------------------+----------
+ success | 22281874
+ no-pdf-link | 2258352
+ redirect-loop | 1499251
+ terminal-bad-status | 1004781
+ no-capture | 401333
+ wrong-mimetype | 112068
+ cdx-error | 32259
+ link-loop | 30137
+ null-body | 13886
+ wayback-error | 11653
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1036
+ redirects-exceeded | 749
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+22063013 -> 22281874 = + 218,861 success, not bad!
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..fe22c75
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,428 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+ => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';
+ => COPY 49491452
+
+ WARNING: should have transformed from rows to requests here
+
+ cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Crawl and re-ingest
+
+Updated stats after ingest (NOTE: ingest requests not really formed correctly,
+but doesn't matter because fatcat wasn't importing these anyways):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 42565875
+ success | 5227609
+ no-pdf-link | 2156341
+ redirect-loop | 559721
+ cdx-error | 260446
+ wrong-mimetype | 148871
+ terminal-bad-status | 109725
+ link-loop | 92792
+ null-body | 30688
+ | 15287
+ petabox-error | 11109
+ wayback-error | 6261
+ skip-url-blocklist | 184
+ gateway-timeout | 86
+ bad-gzip-encoding | 25
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ bad-redirect | 15
+ spn2-error | 4
+ spn2-error:job-failed | 2
+ (20 rows)
+
+Dump again for crawling:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error')
+ ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json';
+
+Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains:
+
+ 4876135 www.kb.dk REMOVE: too large and generic
+ 3110009 kb-images.kb.dk REMOVE: dead?
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken
+ 982312 aggr.ukm.um.si REMOVE: maybe broken
+
+And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so
+expecting at least 11,052,439 `no-capture` ingest results (and should probably
+filter for these or even delete from the ingest request table).
+
+Ingest progress:
+
+ 2020-08-05 14:02: 32,571,018
+ 2020-08-06 13:49: 31,195,169
+ 2020-08-07 10:11: 29,986,169
+ 2020-08-10 10:43: 26,497,196
+ 2020-08-12 11:02: 23,811,845
+ 2020-08-17 13:34: 19,460,502
+ 2020-08-20 09:49: 15,069,507
+ 2020-08-25 09:56: 9,397,035
+ 2020-09-02 15:02: 305,889 (72k longest queue)
+ 2020-09-03 14:30: done
+
+## Post-ingest stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 16804277
+ no-pdf-link | 14895249
+ success | 13898603
+ redirect-loop | 2709730
+ cdx-error | 827024
+ terminal-bad-status | 740037
+ wrong-mimetype | 604242
+ link-loop | 532553
+ null-body | 95721
+ wayback-error | 41864
+ petabox-error | 19204
+ | 15287
+ gateway-timeout | 510
+ bad-redirect | 318
+ skip-url-blocklist | 184
+ bad-gzip-encoding | 114
+ timeout | 78
+ spn2-cdx-lookup-failure | 59
+ invalid-host-resolution | 19
+ blocked-cookie | 6
+ (20 rows)
+
+Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth
+dumping the full kafka result topic, filter to OAI requests, and extracting the
+missing URLs.
+
+Top counts by OAI prefix:
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 25;
+
+ oai_prefix | success | total
+ --------------------------+---------+---------
+ kb.dk | 0 | 7989412 (excluded)
+ repec | 1118591 | 2783448
+ bnf.fr | 0 | 2187277
+ hispana.mcu.es | 19404 | 1492639
+ bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?)
+ hal | 564700 | 1049607
+ ukm.si | 0 | 982468 (excluded)
+ hsp.org | 0 | 810281
+ www.irgrid.ac.cn | 17578 | 748828
+ cds.cern.ch | 72811 | 688091
+ americanae.aecid.es | 69678 | 572792
+ biodiversitylibrary.org | 2121 | 566154
+ juser.fz-juelich.de | 22777 | 518551
+ espace.library.uq.edu.au | 6494 | 508960
+ igi.indrastra.com | 58689 | 478577
+ archive.ugent.be | 63654 | 424014
+ hrcak.srce.hr | 395031 | 414897
+ zir.nsk.hr | 153889 | 397200
+ renati.sunedu.gob.pe | 78399 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7963 | 354529
+ generic.eprints.org | 261221 | 340470
+ invenio.nusl.cz | 6184 | 325867
+ evastar-karlsruhe.de | 62044 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ (25 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ oai_prefix | status | count
+ --------------------------+---------------+---------
+ kb.dk | no-capture | 7955231 (excluded)
+ bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?)
+ repec | success | 1118591
+ hispana.mcu.es | no-pdf-link | 1118092
+ bnf.fr | no-capture | 1100591
+ ukm.si | no-capture | 976004 (excluded)
+ hsp.org | no-pdf-link | 773496
+ repec | no-pdf-link | 625629
+ bnf.fr | no-pdf-link | 607813
+ hal | success | 564700
+ biodiversitylibrary.org | no-pdf-link | 531409
+ cds.cern.ch | no-capture | 529842
+ repec | redirect-loop | 504393
+ juser.fz-juelich.de | no-pdf-link | 468813
+ bnf.fr | redirect-loop | 436087
+ americanae.aecid.es | no-pdf-link | 409954
+ hrcak.srce.hr | success | 395031
+ www.irgrid.ac.cn | no-pdf-link | 362087
+ hal | no-pdf-link | 352111
+ www.irgrid.ac.cn | no-capture | 346963
+ espace.library.uq.edu.au | no-pdf-link | 315302
+ igi.indrastra.com | no-pdf-link | 312087
+ repec | no-capture | 309882
+ invenio.nusl.cz | no-pdf-link | 302657
+ hypotheses.org | no-pdf-link | 298750
+ rour.neicon.ru | redirect-loop | 291922
+ renati.sunedu.gob.pe | no-capture | 276388
+ t2r2.star.titech.ac.jp | no-pdf-link | 264109
+ generic.eprints.org | success | 261221
+ quod.lib.umich.edu | no-pdf-link | 253937
+ (30 rows)
+
+If we remove excluded prefixes, and some large/generic prefixes (bnf.fr,
+hispana.mcu.es, hsp.org), then the aggregate counts are:
+
+ no-capture | 16,804,277 -> 5,502,242
+ no-pdf-link | 14,895,249 -> 12,395,848
+
+Top status by terminal domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ----------------------------------+---------------+--------
+ hispana.mcu.es | no-pdf-link | 709701 (national scope)
+ gallica.bnf.fr | no-pdf-link | 601193 (national scope)
+ discover.hsp.org | no-pdf-link | 524212 (historical)
+ www.biodiversitylibrary.org | no-pdf-link | 479288
+ gallica.bnf.fr | redirect-loop | 435981 (national scope)
+ hrcak.srce.hr | success | 389673
+ hemerotecadigital.bne.es | no-pdf-link | 359243
+ juser.fz-juelich.de | no-pdf-link | 345112
+ espace.library.uq.edu.au | no-pdf-link | 304299
+ invenio.nusl.cz | no-pdf-link | 302586
+ igi.indrastra.com | no-pdf-link | 292006
+ openrepository.ru | redirect-loop | 291555
+ hal.archives-ouvertes.fr | success | 278134
+ t2r2.star.titech.ac.jp | no-pdf-link | 263971
+ bib-pubdb1.desy.de | no-pdf-link | 254879
+ quod.lib.umich.edu | no-pdf-link | 250382
+ encounters.hsp.org | no-pdf-link | 248132
+ americanae.aecid.es | no-pdf-link | 245295
+ www.irgrid.ac.cn | no-pdf-link | 242496
+ publikationen.bibliothek.kit.edu | no-pdf-link | 222041
+ www.sciencedirect.com | no-pdf-link | 211756
+ dialnet.unirioja.es | redirect-loop | 203615
+ edoc.mpg.de | no-pdf-link | 195526
+ bibliotecadigital.jcyl.es | no-pdf-link | 184671
+ hal.archives-ouvertes.fr | no-pdf-link | 183809
+ www.sciencedirect.com | redirect-loop | 173439
+ lup.lub.lu.se | no-pdf-link | 165788
+ orbi.uliege.be | no-pdf-link | 158313
+ www.erudit.org | success | 155986
+ lib.dr.iastate.edu | success | 153384
+ (30 rows)
+
+Follow-ups are TBD but could include:
+- crawling the ~5m no-capture links directly (eg, not `base_url`) from the
+ ingest result JSON, while retaining the ingest request for later re-ingest
+- investigating and iterating on PDF link extraction, both for large platforms
+ and randomly sampled from long tail
+- classifying OAI prefixes by type (subject repository, institutional
+ repository, journal, national-library, historical docs, greylit, law, etc)
+- running pdftrio over some/all of this corpus
diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md
new file mode 100644
index 0000000..36d00a1
--- /dev/null
+++ b/notes/ingest/2020-05_pubmed.md
@@ -0,0 +1,10 @@
+
+From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1.
+
+Test small batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md
new file mode 100644
index 0000000..1d33162
--- /dev/null
+++ b/notes/ingest/2020-07_mag.md
@@ -0,0 +1,353 @@
+
+Using 2020-06-25 upstream MAG corpus snapshot.
+
+Ran munging from `scratch:ingest/mag` notes first.
+
+Expecting a couple million new ingest request URLs; this is the first "patch"
+MAG ingest on top of existing already-run requests.
+
+Planning to skip the initial bulk ingest step, on the assumption that new URLs
+have either been ingested already (eg, via continuous ingest pipeline) or need
+crawling.
+
+## Generate Requests
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json
+ => 28.7M 2:36:48 [3.06k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json
+ => 5.66M 0:29:28 [ 3.2k/s]
+
+## Persist Ingest Requests
+
+ # small sample
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0})
+
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0})
+
+## Crawl/Dupe Status
+
+Overall status for old and new seeds, filtering out large (blocking)
+publishers:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 19477651
+ | 8238898
+ redirect-loop | 2036494
+ link-loop | 1330036
+ no-pdf-link | 1304820
+ terminal-bad-status | 648150
+ no-capture | 545785
+ gateway-timeout | 200143
+ cdx-error | 149995
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 57052
+ wayback-error | 41032
+ invalid-host-resolution | 37203
+ petabox-error | 11167
+ null-body | 6662
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 77
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ | 8238851
+ success | 787174
+ no-capture | 42864
+ redirect-loop | 31718
+ terminal-bad-status | 31493
+ no-pdf-link | 13025
+ cdx-error | 11275
+ wrong-mimetype | 6238
+ link-loop | 3365
+ wayback-error | 748
+ gateway-timeout | 506
+ null-body | 191
+ spn2-cdx-lookup-failure | 99
+ petabox-error | 89
+ invalid-host-resolution | 70
+ spn2-error | 7
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ bad-gzip-encoding | 1
+ (19 rows)
+
+Where are no-capture results terminating? May need to add or update heritrix
+crawl config so that we get better yield without needing to do SPNv2 crawling.
+
+ SELECT initial_domain, terminal_domain, COUNT(*)
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ GROUP BY initial_domain, terminal_domain
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ initial_domain | terminal_domain | count
+ ---------------------------------+---------------------+--------
+ www.researchgate.net | | 334145
+ academic.oup.com | | 205820
+ www.tandfonline.com | | 148638
+ journals.sagepub.com | | 144196
+ muse.jhu.edu | | 55957
+ hrcak.srce.hr | | 25317
+ www.omicsonline.org | | 22426
+ link.springer.com | | 21044
+ iopscience.iop.org | | 12385
+ bioone.org | | 9097
+ tandfonline.com | | 8512
+ or.nsfc.gov.cn | | 4823
+ ieeexplore.ieee.org | ieeexplore.ieee.org | 4398
+ pubs.acs.org | | 3708
+ archive-ouverte.unige.ch | | 2743
+ dergipark.ulakbim.gov.tr | | 2677
+ hal.archives-ouvertes.fr | | 1258
+ dergipark.org.tr | | 1207
+ apo.org.au | | 1186
+ spire.sciencespo.fr | | 989
+ cyberleninka.ru | | 895
+ lirias.kuleuven.be | | 855
+ tel.archives-ouvertes.fr | | 786
+ pub.uni-bielefeld.de | | 728
+ www.research-collection.ethz.ch | | 670
+ (25 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtered for some domains that we don't expect to crawl
+via heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status IS NULL)
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json';
+ => 8784683
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json
+
+Seedlist transform from here on covered in MAG crawl notes.
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Small sample:
+
+ head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24574294
+ redirect-loop | 2633731
+ no-capture | 2458694
+ no-pdf-link | 1896871
+ link-loop | 1510899
+ terminal-bad-status | 878821
+ cdx-error | 387574
+ gateway-timeout | 200246
+ | 170304
+ wayback-error | 97572
+ spn2-cdx-lookup-failure | 80284
+ wrong-mimetype | 65097
+ invalid-host-resolution | 37204
+ petabox-error | 12097
+ null-body | 8549
+ spn2-error | 1706
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ (20 rows)
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24557382
+ redirect-loop | 2630582
+ no-capture | 1947066
+ no-pdf-link | 1778206
+ link-loop | 1510790
+ terminal-bad-status | 857173
+ cdx-error | 384525
+ gateway-timeout | 200143
+ wayback-error | 96390
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 64908
+ invalid-host-resolution | 37203
+ petabox-error | 12087
+ null-body | 8548
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ | 69
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+---------
+ success | 5860601
+ no-capture | 1489959
+ redirect-loop | 619121
+ no-pdf-link | 473703
+ terminal-bad-status | 234753
+ cdx-error | 231575
+ link-loop | 184093
+ wayback-error | 56068
+ wrong-mimetype | 14046
+ null-body | 2068
+ petabox-error | 1006
+ gateway-timeout | 506
+ spn2-cdx-lookup-failure | 99
+ invalid-host-resolution | 70
+ | 22
+ bad-redirect | 13
+ spn2-error | 7
+ timeout | 3
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ (20 rows)
+
diff --git a/notes/ingest/2020-08_daily_improvements.md b/notes/ingest/2020-08_daily_improvements.md
new file mode 100644
index 0000000..da57065
--- /dev/null
+++ b/notes/ingest/2020-08_daily_improvements.md
@@ -0,0 +1,202 @@
+
+Goal is to increase rate of successful daily changelog crawling, but reduce
+wasted attempts.
+
+Status by domain, past 30 days:
+
+ domain | status | count
+ --------------------------------------+-----------------+-------
+ arxiv.org | success | 21792
+ zenodo.org | success | 10646
+ res.mdpi.com | success | 10449
+ springernature.figshare.com | no-pdf-link | 10430
+ s3-eu-west-1.amazonaws.com | success | 8966
+ zenodo.org | no-pdf-link | 8137
+ hkvalidate.perfdrive.com | no-pdf-link | 5943
+ www.ams.org:80 | no-pdf-link | 5799
+ assets.researchsquare.com | success | 4651
+ pdf.sciencedirectassets.com | success | 4145
+ fjfsdata01prod.blob.core.windows.net | success | 3500
+ sage.figshare.com | no-pdf-link | 3174
+ onlinelibrary.wiley.com | no-pdf-link | 2869
+ www.e-periodica.ch | no-pdf-link | 2709
+ revistas.uned.es | success | 2631
+ figshare.com | no-pdf-link | 2500
+ www.sciencedirect.com | link-loop | 2477
+ linkinghub.elsevier.com | gateway-timeout | 1878
+ downloads.hindawi.com | success | 1819
+ www.scielo.br | success | 1691
+ jps.library.utoronto.ca | success | 1590
+ www.ams.org | no-pdf-link | 1568
+ digi.ub.uni-heidelberg.de | no-pdf-link | 1496
+ research-repository.griffith.edu.au | success | 1412
+ journals.plos.org | success | 1330
+ (25 rows)
+
+Status by DOI prefix, past 30 days:
+
+ doi_prefix | status | count
+ ------------+-------------------------+-------
+ 10.6084 | no-pdf-link | 14410 <- figshare; small fraction success
+ 10.6084 | success | 4007
+ 10.6084 | cdx-error | 1746
+
+ 10.13140 | gateway-timeout | 9689 <- researchgate
+ 10.13140 | cdx-error | 4154
+
+ 10.5281 | success | 9408 <- zenodo
+ 10.5281 | no-pdf-link | 6079
+ 10.5281 | cdx-error | 3200
+ 10.5281 | wayback-error | 2098
+
+ 10.1090 | no-pdf-link | 7420 <- AMS (ams.org)
+
+ 10.3390 | success | 6599 <- MDPI
+ 10.3390 | cdx-error | 3032
+ 10.3390 | wayback-error | 1636
+
+ 10.1088 | no-pdf-link | 3227 <- IOP science
+
+ 10.1101 | gateway-timeout | 3168 <- coldspring harbor: press, biorxiv, medrxiv, etc
+ 10.1101 | cdx-error | 1147
+
+ 10.21203 | success | 3124 <- researchsquare
+ 10.21203 | cdx-error | 1181
+
+ 10.1016 | success | 3083 <- elsevier
+ 10.1016 | cdx-error | 2465
+ 10.1016 | gateway-timeout | 1682
+ 10.1016 | wayback-error | 1567
+
+ 10.25384 | no-pdf-link | 3058 <- sage figshare
+ 10.25384 | success | 2456
+
+ 10.1007 | gateway-timeout | 2913 <- springer
+ 10.1007 | cdx-error | 1164
+
+ 10.5944 | success | 2831
+ 10.1186 | success | 2650
+ 10.5169 | no-pdf-link | 2644 <- www.e-periodica.ch
+ 10.3389 | success | 2279
+ 10.24411 | gateway-timeout | 2184 <- cyberleninka.ru
+ 10.1038 | gateway-timeout | 2143 <- nature group
+ 10.1177 | gateway-timeout | 2038 <- SAGE
+ 10.11588 | no-pdf-link | 1574 <- journals.ub.uni-heidelberg.de (OJS?)
+ 10.25904 | success | 1416
+ 10.1155 | success | 1304
+ 10.21994 | no-pdf-link | 1268 <- loar.kb.dk
+ 10.18720 | spn2-cdx-lookup-failure | 1232 <- elib.spbstu.ru
+ 10.24411 | cdx-error | 1202
+ 10.1055 | no-pdf-link | 1170 <- thieme-connect.de
+ (40 rows)
+
+code changes for ingest:
+x hkvalidate.perfdrive.com: just bail when we see this
+x skip large publishers which gateway-timeout (for now)
+ - springerlink (10.1007)
+ - nature group (10.1038)
+ - SAGE (10.1177)
+ - IOP (10.1088)
+
+fatcat:
+x figshare (by `doi_prefix`): if not versioned (suffix), skip crawl
+x zenodo: also try to not crawl if unversioned (group)
+x figshare import metadata
+
+sandcrawler:
+x ends with `cookieAbsent` or `cookieSet=1` -> status as cookie-blocked
+x https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+x verify that we do quick-get for arxiv.org + europepmc.org (+ figshare/zenodo?)
+ => we were not!
+x shorten post-SPNv2 CDX pause? for throughput, given that we are re-trying anyways
+x ensure that we store uncrawled URL somewhere on no-capture status
+ => in HTML or last of hops
+ => not in DB, but that is a bigger change
+
+- try to get un-blocked:
+ - coldspring harbor has been blocking since 2020-06-22? yikes!
+ - cyberleninka.ru
+ - arxiv.org
+
+- no-pdf-link
+ x www.ams.org (10.1090)
+ => these seem to be stale captures, eg from 2008. newer captures have citation_pdf_url
+ => should consider recrawling all of ams.org?
+ => not sure why these crawl requests are happening only now
+ => on the order of 15k OA articles not in ia; 43k total not preserved
+ => force recrawl OA subset (DONE)
+ x www.e-periodica.ch (10.5169)
+ => TODO: dump un-preserved URLs, transform to PDF urls, heritrix crawl, re-ingest
+ x digi.ub.uni-heidelberg.de (10.11588)
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ - https://loar.kb.dk/handle/1902/6988 (10.21994)
+ => TODO: bulk re-enqueue
+ => site was updated recently (august 2020); now it crawls fine. need to re-ingest all?
+ => 7433 hits
+ - thieme-connect.de (10.1055)
+ => 600k+ missing
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ => https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+ => generally just need to re-crawl all?
+
+Unresolved:
+- why so many spn2-errors on https://elib.spbstu.ru/ (10.18720)?
+
+## figshare
+
+10.6084 regular figshare
+10.25384 SAGE figshare
+
+For sage, "collections" are bogus? can we detect these in datacite metadata?
+
+If figshare types like:
+
+ ris: "GEN",
+ bibtex: "misc",
+ citeproc: "article",
+ schemaOrg: "Collection",
+ resourceType: "Collection",
+ resourceTypeGeneral: "Collection"
+
+then mark as 'stub'.
+
+"Additional file" items don't seem like "stub"; -> "component".
+
+title:"Figure {} from " -> component
+
+current types are mostly: article, stub, dataset, graphic, article-journal
+
+If DOI starts with "sage.", then publisher is "Sage" (not figshare). Container
+name should be... sage.figshare.com?
+
+set version to the version from DOI
+
+## zenodo
+
+doi_prefix: 10.5281
+
+if on zenodo, and has an "Identical to" relation, then this is a pre-print. in
+that case, drop container_id and set container_name to zenodo.org. *But*, there
+are some journals now publishing exclusively to zenodo.org, so retain that
+metadata. examples:
+
+ "Detection of keyboard vibrations and effects on perceived piano quality"
+ https://fatcat.wiki/release/mufzkdgt2nbzfha44o7p7gkrpy
+
+ "Editing LAF: Educate, don't defend!"
+ https://zenodo.org/record/2583025
+
+version number not available in zenodo metadata
+
+## Gitlab MR Notes
+
+The main goal of this group of changes is to do a better job at daily ingest.
+
+Currently we have on the order of 20k new releases added to the index every day, and about half of them are marked as OA (either CC license or via container being in DOAJ or ROAD), and pass some filters (eg, release_type), and are selected for ingest. Of those, about half fail to crawl to fulltext, due to blocking (gateway-timeout, cookie tests, anti-bot detection, loginwall, etc). On the other hand, we don't attempt to crawl lots of "bronze" OA, which is content that is available from the publisher website, but isn't marked explicitly OA.
+
+Based on investigating daily crawling from the past month (will commit these notes to sandcrawler soon), I have identified some DOI prefixes that almost always fail ingest via SPNv2. I also have some patches to sandcrawler ingest to improve ability to crawl some large repositories etc.
+
+Some of the biggest "OA but failed to crawl" are from figshare and zenodo, which register a relatively large fraction of daily OA DOIs. We want to crawl most of that content, but both of these platforms register at least two DOIs for each piece of content (a "group" DOI and a "versioned" DOI), and we only need to crawl one. There were also some changes needed to release-type filtering and assignment specific to these platforms, or based on the title of entities.
+
+This MR mixes changes to the datacite metadata import routine (including some refactors out of the main parse_record method) and behavior changes to the entity updater (which is where the code to decide about whether to send an ingest request on release creation lives). I will have a separate MR for importer metadata changes that don't impact ingest behavior.
+
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md
new file mode 100644
index 0000000..f5c853d
--- /dev/null
+++ b/notes/ingest/2020-09_oa_doi.md
@@ -0,0 +1,352 @@
+
+It seems that many gold OA DOIs were not ingesting simply because the HTML
+url extraction was not working for a particular version of OJS.
+
+Let's re-try all ~2.5 million of these in bulk mode and see how many are
+'no-capture' vs. other errors, then possibly re-crawl a large number.
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json
+ Expecting 2569876 release objects in search queries
+ Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034})
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started at about:
+
+ Thu Sep 17 00:15:00 UTC 2020
+ 2020-09-17T00:15:00Z
+
+## Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND ingest_file_result.updated >= '2020-09-16'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 513462
+ success | 206042
+ no-pdf-link | 186779
+ terminal-bad-status | 40372
+ redirect-loop | 33103
+ cdx-error | 24078
+ link-loop | 13494
+ spn2-cdx-lookup-failure | 10247
+ gateway-timeout | 4407
+ wrong-mimetype | 3213
+ petabox-error | 866
+ null-body | 449
+ spn2-error | 217
+ wayback-error | 129
+ spn2-error:job-failed | 64
+ bad-redirect | 6
+ spn2-error:soft-time-limit-exceeded | 1
+ (17 rows)
+
+This was only about half the requests. Try... broader?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 579952
+ success | 387325
+ no-pdf-link | 380406
+ terminal-bad-status | 63743
+ redirect-loop | 53893
+ cdx-error | 46024
+ spn2-cdx-lookup-failure | 28347
+ link-loop | 22573
+ gateway-timeout | 11686
+ wrong-mimetype | 6294
+ null-body | 3509
+ petabox-error | 2388
+ spn2-error | 1023
+ spn2-error:job-failed | 462
+ wayback-error | 347
+ spn2-error:soft-time-limit-exceeded | 20
+ bad-redirect | 11
+ (17 rows)
+
+What are the top domains for those `no-pdf-link` (or similar)?
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 56488
+ figshare.com | no-pdf-link | 55337
+ www.egms.de | redirect-loop | 22686
+ zenodo.org | terminal-bad-status | 22128
+ tandf.figshare.com | no-pdf-link | 20027
+ springernature.figshare.com | no-pdf-link | 17181
+ cairn.info | terminal-bad-status | 13836
+ www.persee.fr | terminal-bad-status | 7565
+ projecteuclid.org | link-loop | 7449
+ www.cairn.info | no-pdf-link | 6992
+ scialert.net | no-pdf-link | 6621
+ www.cairn.info | link-loop | 5870
+ utpjournals.press | no-pdf-link | 5772
+ journals.openedition.org | redirect-loop | 5464
+ www.egms.de | no-pdf-link | 5223
+ archaeologydataservice.ac.uk | no-pdf-link | 4881
+ rs.figshare.com | no-pdf-link | 4773
+ www.degruyter.com | spn2-cdx-lookup-failure | 4763
+ koreascience.or.kr | no-pdf-link | 4487
+ cancerres.aacrjournals.org | no-pdf-link | 4124
+ cms.math.ca | no-pdf-link | 3441
+ volcano.si.edu | no-pdf-link | 3424
+ www.mathnet.ru | no-pdf-link | 3229
+ tidsskriftet.no | no-pdf-link | 3012
+ journals.plos.org | no-pdf-link | 3005
+ tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796
+ www.cairn.info:80 | link-loop | 2647
+ hammer.figshare.com | no-pdf-link | 2627
+ www.psychosocial.com | no-pdf-link | 2457
+ osf.io | terminal-bad-status | 2388
+ (30 rows)
+
+Should look at link extraction for:
+
+- scialert.net
+- utpjournals.press
+- koreascience.or.kr
+- cancerres.aacrjournals.org
+- cms.math.ca
+- volcano.si.edu
+- www.mathnet.ru
+- www.psychosocial.com
+
+## Re-Ingest
+
+Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ AND ingest_file_result.status = 'no-capture'
+ -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json';
+ => COPY 579952
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json
+ => 579k 0:00:22 [25.9k/s]
+
+ cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Resuming progress on this in early December 2020.
+
+Filtered requests to re-crawl:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20')
+ OR (ingest_file_result.updated >= '2020-10-11'))
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json';
+ => COPY 2352614
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+ wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt
+ 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+Top DOI prefixes (same old usual suspects):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20
+ 353695 10.5281 zenodo.org
+ 121888 10.6084 figshare.com
+ 115093 10.3917 cairn.info
+ 113252 10.3406 persee.fr
+ 95414 10.1515 degruyter.com
+ 90448 10.4324 taylorfrancis.com
+ 83927 10.1016 elsevier
+ 60303 10.1109 IEEE
+ 48490 10.4000 openedition.org
+ 28498 10.3205 egms.de
+ 23433 10.1163 brill.com
+ 23276 10.17615 cdr.lib.unc.edu
+ 21386 10.1093 oup.com
+ 20783 10.3138 utpjournals.press
+ 19987 10.1201 tandfonline.com
+ 17916 10.34847 cocoon.huma-num.fr
+ 16970 10.1002 wiley.com
+ 15958 10.1097 lww.com (and others?)
+ 15835 10.1017 cambridge.org
+ 15466 10.24355 publikationsserver.tu-braunschweig.de (IR)
+
+Top domains (not doi.org):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 104148 zenodo.org
+ 85245 www.persee.fr
+ 52931 www.cairn.info
+ 4791 www.jstage.jst.go.jp
+ 4411 archive.monthlyreview.org
+ 4129 osf.io
+ 2841 www.indianjournals.com
+ 2746 www.impan.pl
+ 2620 platform.almanhal.com
+ 2019 www.nomos-elibrary.de
+ 1209 dergipark.org.tr
+ 1027 pubs.geoscienceworld.org
+ 973 www.pdcnet.org
+ 923 www.hanspub.org
+ 914 www.repository.cam.ac.uk
+ 863 mediarep.org
+ 812 www.cartographicperspectives.org
+ 687 www.degruyter.com
+ 578 192.168.7.24
+ 566 journals.eco-vector.com
+
+TODO: infer `publisher_type` and platform from DOI prefix in more cases
+
+## Re-Ingest
+
+Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3
+million requests. Note these are all `pdf` requests, but crawl was done in an
+HTML-friendly way, so should be able to do domain/journal-specific HTML ingests
+in the future.
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Stats, for this ingest period (fuzzy; will have some daily ingest stuff):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-12-28'
+ AND ingest_request.created <= '2020-12-09'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -----------------------+--------
+ no-pdf-link | 962714
+ success | 539305
+ no-capture | 306590
+ redirect-loop | 192149
+ link-loop | 184797
+ terminal-bad-status | 141721
+ wrong-mimetype | 10362
+ null-body | 10277
+ skip-url-blocklist | 1985
+ wayback-content-error | 1300
+ cdx-error | 869
+ petabox-error | 160
+ bad-redirect | 72
+ wayback-error | 46
+ bad-gzip-encoding | 7
+ timeout | 1
+ max-hops-exceeded | 1
+ (17 rows)
+
diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md
new file mode 100644
index 0000000..ec4e536
--- /dev/null
+++ b/notes/ingest/2020-09_reingest.md
@@ -0,0 +1,197 @@
+
+Goal: re-bulk-ingest some older existing crawls which hung on errors like
+`cdx-error` or `wayback-error`, indicating that ingest might actually succeed
+on retry.
+
+Sources:
+- unpaywall (again)
+- doi (ingest, changelog, etc)
+- mag
+- oai
+
+## DOI
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 8304582
+ success | 3461708
+ no-capture | 1881269
+ redirect-loop | 1851541
+ gateway-timeout | 355820
+ cdx-error | 341848
+ terminal-bad-status | 328650
+ skip-url-blocklist | 220474
+ spn2-cdx-lookup-failure | 125521
+ link-loop | 109352
+ wayback-error | 101525
+ null-body | 73539
+ wrong-mimetype | 53151
+ spn-error | 13579
+ spn2-error | 6848
+ spn2-error:job-failed | 4381
+ spn-remote-error | 4180
+ other-mimetype | 2305
+ petabox-error | 904
+ timeout | 710
+ spn2-error:soft-time-limit-exceeded | 557
+ spn2-error:proxy-error | 437
+ spn2-error:browser-running-error | 273
+ invalid-host-resolution | 233
+ pending | 116
+ (25 rows)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json';
+ => 443421
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+Additional 27,779 success status? Hard to tell because lots of other ingest
+running in parallel.
+
+Live:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'spn-error' OR
+ ingest_file_result.status = 'spn2-cdx-lookup-failure' OR
+ ingest_file_result.status = 'spn2-error:job-failed' OR
+ ingest_file_result.status = 'spn2-error:proxy-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json';
+ => 143984
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Unpaywall (again)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json';
+ => 43912
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## MAG
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json';
+ => 188175
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## OAI-PMH
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json';
+ => 851056
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+---------
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+
diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md
new file mode 100644
index 0000000..4ec6fbd
--- /dev/null
+++ b/notes/ingest/2020-09_scielo.md
@@ -0,0 +1,21 @@
+
+As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing
+fatcat releases with no IA copy and with `publisher_type:scielo`. There are
+200k+ such releases.
+
+It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008
+
+Could try XML ingest of these!
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json
+ Expecting 212529 release objects in search queries
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done 2020-09-14
+
diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md
new file mode 100644
index 0000000..d2bb50b
--- /dev/null
+++ b/notes/ingest/2020-10_daily.md
@@ -0,0 +1,193 @@
+
+Quick notes on how daily ingest is going, circa September/October 2020.
+
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
+ ORDER BY date(ingest_request.created) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+-------+---------
+ pdf | 2020-10-10 | 6145 | 1368
+ pdf | 2020-10-09 | 28453 | 6461
+ pdf | 2020-10-08 | 15105 | 3803
+ pdf | 2020-10-07 | 34213 | 10813
+ pdf | 2020-10-06 | 22263 | 8565
+ pdf | 2020-10-05 | 7910 | 3200
+ pdf | 2020-10-04 | 10865 | 4579
+ pdf | 2020-10-03 | 27745 | 10818
+ pdf | 2020-10-02 | 34320 | 13523
+ pdf | 2020-10-01 | 32548 | 13252
+ pdf | 2020-09-30 | 34798 | 14113
+ pdf | 2020-09-29 | 22463 | 8328
+ pdf | 2020-09-28 | 4117 | 1278
+ pdf | 2020-09-27 | 5894 | 1732
+ pdf | 2020-09-26 | 34949 | 13901
+ pdf | 2020-09-25 | 33680 | 10605
+ pdf | 2020-09-24 | 15125 | 5785
+ pdf | 2020-09-23 | 20866 | 6584
+ pdf | 2020-09-22 | 20949 | 7167
+ pdf | 2020-09-21 | 22483 | 7308
+ pdf | 2020-09-20 | 45644 | 16981
+ pdf | 2020-09-19 | 95571 | 31991
+ pdf | 2020-09-18 | 50849 | 15875
+ pdf | 2020-09-17 | 20121 | 3158
+ pdf | 2020-09-16 | 39184 | 12150
+ pdf | 2020-09-15 | 16986 | 7705
+ (26 rows)
+
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+--------
+ pdf | success | 241047
+ pdf | no-pdf-link | 143084
+ pdf | spn2-cdx-lookup-failure | 108311
+ pdf | gateway-timeout | 97250
+ pdf | cdx-error | 61820
+ pdf | link-loop | 31350
+ pdf | wayback-error | 9139
+ pdf | spn2-error:job-failed | 4240
+ pdf | spn2-error | 3893
+ pdf | wrong-mimetype | 1010
+ pdf | no-capture | 851
+ pdf | null-body | 605
+ pdf | redirect-loop | 261
+ pdf | spn2-error:soft-time-limit-exceeded | 126
+ pdf | terminal-bad-status | 120
+ pdf | petabox-error | 105
+ pdf | timeout | 29
+ pdf | spn2-error:no-status | 2
+ pdf | spn2-error:invalid-server-response | 2
+ pdf | bad-gzip-encoding | 1
+ (20 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 52767
+ www.degruyter.com | link-loop | 17666
+ www.degruyter.com | spn2-cdx-lookup-failure | 17597
+ ieeexplore.ieee.org | gateway-timeout | 15290
+ www.sciencedirect.com | no-pdf-link | 14043
+ apps.crossref.org | no-pdf-link | 11531
+ figshare.com | no-pdf-link | 8966
+ tandf.figshare.com | no-pdf-link | 7276
+ zenodo.org | no-capture | 7191
+ springernature.figshare.com | no-pdf-link | 6485
+ www.taylorfrancis.com | link-loop | 6266
+ www.persee.fr | terminal-bad-status | 6031
+ journals.openedition.org | gateway-timeout | 5639
+ www.cairn.info | link-loop | 5618
+ archaeologydataservice.ac.uk | no-pdf-link | 5359
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748
+ www.e-periodica.ch | no-pdf-link | 4722
+ osf.io | no-capture | 4247
+ cancerres.aacrjournals.org | no-pdf-link | 4136
+ dlc.library.columbia.edu | no-pdf-link | 4085
+ www.egms.de | no-pdf-link | 3304
+ journals.lww.com | no-pdf-link | 3218
+ journals.plos.org | no-pdf-link | 3005
+ linkinghub.elsevier.com | gateway-timeout | 2833
+ www.egms.de | redirect-loop | 2606
+ (25 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------------------+---------+-------
+ zenodo.org | success | 55549
+ arxiv.org | success | 24450
+ s3-eu-west-1.amazonaws.com | success | 18156
+ res.mdpi.com | success | 13493
+ www.degruyter.com | success | 12009
+ journals.openedition.org | success | 11235
+ www.jstage.jst.go.jp | success | 9460
+ peer.asee.org | success | 9416
+ www.e-periodica.ch | success | 8105
+ ir.canterbury.ac.nz | success | 6381
+ europepmc.org | success | 5670
+ www.repository.cam.ac.uk | success | 4858
+ assets.researchsquare.com | success | 4765
+ fjfsdata01prod.blob.core.windows.net | success | 4130
+ tidsskrift.dk | success | 3964
+ research-journal.org | success | 3127
+ ieeexplore.ieee.org | success | 2947
+ dergipark.org.tr | success | 2892
+ watermark.silverchair.com | success | 2315
+ journals.plos.org | success | 2304
+ journal.fi | success | 1996
+ publications.rwth-aachen.de | success | 1954
+ www.brazilianjournals.com | success | 1637
+ article.sciencepublishinggroup.com | success | 1589
+ revistas.upr.edu | success | 1467
+ (25 rows)
+
+Casual take-aways:
+- wonder what `apps.crossref.org` is
+- sciencedirect crawling broken?
+- figshare might be broken? or just very little success
+- seems like a lot of journals.plos.org failures
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
new file mode 100644
index 0000000..a991025
--- /dev/null
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -0,0 +1,286 @@
+
+New snapshot released 2020-10-09. Want to do a mostly straightforward
+load/ingest/crawl.
+
+Proposed changes this time around:
+
+- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture`
+ status, and include those URLs in the heritrix3 crawl
+- tweak heritrix3 config for additional PDF URL extraction patterns,
+ particularly to improve OJS yield
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json
+ => 28.3M 3:19:03 [2.37k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 28.3M 1:11:29 [ 6.6k/s]
+ => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2020-10-09'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json';
+ => COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json
+ => 4.22M 0:02:48 [ 25k/s]
+
+Start small, to test no-capture behavior:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+`no-capture` change looks good. Enqueue the whole batch:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 23661282
+ no-capture | 3015447
+ no-pdf-link | 2302102
+ redirect-loop | 1542566
+ terminal-bad-status | 1044676
+ wrong-mimetype | 114315
+ link-loop | 36358
+ cdx-error | 20150
+ null-body | 14513
+ wayback-error | 13644
+ gateway-timeout | 3776
+ spn2-cdx-lookup-failure | 1260
+ petabox-error | 1171
+ redirects-exceeded | 752
+ invalid-host-resolution | 464
+ spn2-error | 147
+ bad-redirect | 131
+ spn2-error:job-failed | 91
+ wayback-content-error | 45
+ timeout | 19
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ ) t1
+ ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json';
+ => 2,936,404
+
+ # TODO: in the future also exclude "www.archive.org"
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt
+
+ wc -l unpaywall_seedlist_2020-11-02.*.txt
+ 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt
+ 2713866 unpaywall_seedlist_2020-11-02.url.txt
+
+With things like jsessionid, suspect that crawling just the terminal URLs is
+going to work better than both full and terminal.
+
+Finding that a fraction of `no-capture` results have partial/stub URLs as their `terminal_url`.
+
+TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP).
+
+
+## Bulk Ingest and Status
+
+Note, removing archive.org links:
+
+ cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Overall status (checked 2020-12-08):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 25004559
+ no-pdf-link | 2531841
+ redirect-loop | 1671375
+ terminal-bad-status | 1389463
+ no-capture | 893880
+ wrong-mimetype | 119332
+ link-loop | 66508
+ wayback-content-error | 30339
+ cdx-error | 21790
+ null-body | 20710
+ wayback-error | 13976
+ gateway-timeout | 3775
+ petabox-error | 2420
+ spn2-cdx-lookup-failure | 1218
+ redirects-exceeded | 889
+ invalid-host-resolution | 464
+ bad-redirect | 147
+ spn2-error | 112
+ spn2-error:job-failed | 91
+ timeout | 21
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1101090
+ accepted | no-pdf-link | 28590
+ accepted | redirect-loop | 10923
+ accepted | no-capture | 9540
+ accepted | terminal-bad-status | 6339
+ accepted | cdx-error | 952
+ accepted | wrong-mimetype | 447
+ accepted | link-loop | 275
+ accepted | wayback-error | 202
+ accepted | petabox-error | 177
+ accepted | redirects-exceeded | 122
+ accepted | null-body | 27
+ accepted | wayback-content-error | 14
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | bad-redirect | 1
+ published | success | 18595278
+ published | no-pdf-link | 2434935
+ published | redirect-loop | 1364110
+ published | terminal-bad-status | 1185328
+ published | no-capture | 718792
+ published | wrong-mimetype | 112923
+ published | link-loop | 63874
+ published | wayback-content-error | 30268
+ published | cdx-error | 17302
+ published | null-body | 15209
+ published | wayback-error | 10782
+ published | gateway-timeout | 1966
+ published | petabox-error | 1611
+ published | spn2-cdx-lookup-failure | 879
+ published | redirects-exceeded | 760
+ published | invalid-host-resolution | 453
+ published | bad-redirect | 115
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 75
+ published | timeout | 21
+ published | bad-gzip-encoding | 5
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | spn2-error:pending | 1
+ published | blocked-cookie | 1
+ published | | 1
+ published | pending | 1
+ submitted | success | 5308166
+ submitted | redirect-loop | 296322
+ submitted | terminal-bad-status | 197785
+ submitted | no-capture | 165545
+ submitted | no-pdf-link | 68274
+ submitted | wrong-mimetype | 5962
+ submitted | null-body | 5474
+ submitted | cdx-error | 3536
+ submitted | wayback-error | 2992
+ submitted | link-loop | 2359
+ submitted | gateway-timeout | 1805
+ submitted | petabox-error | 632
+ submitted | spn2-cdx-lookup-failure | 334
+ submitted | wayback-content-error | 57
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 31
+ submitted | spn2-error:job-failed | 14
+ submitted | | 12
+ submitted | invalid-host-resolution | 11
+ submitted | redirects-exceeded | 7
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | bad-gzip-encoding | 1
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 11
+ | no-capture | 3
+ (70 rows)
diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md
new file mode 100644
index 0000000..f9abe09
--- /dev/null
+++ b/notes/ingest/2020-11-04_arxiv.md
@@ -0,0 +1,12 @@
+
+Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run
+a crawl.
+
+Crawl is now done, so going to ingest, hoping to get the majority of the
+millions of remaining arxiv.org PDFs.
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l
+ => 1,288,559
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md
new file mode 100644
index 0000000..473dd0d
--- /dev/null
+++ b/notes/ingest/2020-11_doaj.md
@@ -0,0 +1,295 @@
+
+This is the first ingest (and crawl) of URLs from DOAJ article-level metadata.
+It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in
+the past.
+
+Working off a 2020-11-13 snapshot.
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:28 [4.57k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => ran into an error with blank `base_url`
+
+Second try after patches:
+
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:29 [4.56k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036})
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-------------------------+---------
+ pdf | | 3711532
+ html | | 2429003
+ pdf | success | 454403
+ pdf | redirect-loop | 48587
+ pdf | no-pdf-link | 24901
+ pdf | no-capture | 11569
+ xml | | 9442
+ pdf | link-loop | 8466
+ pdf | terminal-bad-status | 2015
+ pdf | wrong-mimetype | 1441
+ pdf | null-body | 1057
+ pdf | petabox-error | 299
+ pdf | cdx-error | 124
+ pdf | gateway-timeout | 114
+ pdf | wayback-error | 77
+ pdf | spn2-cdx-lookup-failure | 20
+ pdf | invalid-host-resolution | 4
+ pdf | spn2-error | 1
+ (18 rows)
+
+## Dump new URLs, Transform, Bulk Ingest (PDF and XML only)
+
+Dump:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.link_source = 'doaj'
+ -- AND date(ingest_request.created) > '2020-12-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json';
+ => COPY 3732543
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json
+ => 3.73M 0:02:18 [26.9k/s]
+
+Definitely some non-URL strings in there; should try to filter those out
+earlier in the transform process. And/or have a constraint on the URL column in
+the database.
+
+Enqueue the whole batch:
+
+ cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started this batch off at 2020-11-19 18:10 (Pacific time)
+
+Stats after run:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 30;
+
+## Dump Seedlist
+
+After preliminary bulk ingest attempts, dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json';
+ => 1,899,555
+
+TODO: filter for valid URLs
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt
+
+ wc -l doaj_seedlist_2020-11-19.*.txt
+
+## Post-Crawl Ingest
+
+Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ
+identifiers are all in fatcat:
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # started 2020-12-23 15:05 (Pacific)
+ # finished around 2020-12-31, after one long/slow partition
+
+Stats again after everything:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ html | wrong-scope | 1089423
+ html | no-capture | 423917
+ html | redirect-loop | 212910
+ html | unknown-scope | 204069
+ html | html-resource-no-capture | 165587
+ html | success | 122937
+ html | null-body | 100296
+ html | wayback-content-error | 53918
+ html | wrong-mimetype | 18908
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13520
+ html | cdx-error | 6823
+ html | wayback-error | 890
+ html | | 620
+ html | blocked-cookie | 543
+ html | blocked-captcha | 250
+ html | redirects-exceeded | 135
+ html | too-many-resources | 111
+ html | max-hops-exceeded | 84
+ html | bad-redirect | 3
+ pdf | success | 2851324
+ pdf | no-pdf-link | 529914
+ pdf | redirect-loop | 349494
+ pdf | no-capture | 272202
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91796
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2968
+ pdf | | 2068
+ pdf | wayback-content-error | 1548
+ pdf | cdx-error | 1095
+ pdf | petabox-error | 1024
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | timeout | 20
+ pdf | max-hops-exceeded | 19
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ xml | cdx-error | 3
+ (43 rows)
+
+
+And on filtered subset that we actually crawled:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-----------------------+---------
+ pdf | success | 2851286
+ pdf | no-pdf-link | 527495
+ pdf | redirect-loop | 345138
+ pdf | no-capture | 268140
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91125
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2907
+ pdf | petabox-error | 363
+ pdf | wayback-content-error | 242
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | max-hops-exceeded | 19
+ pdf | cdx-error | 15
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ (20 rows)
+
diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md
new file mode 100644
index 0000000..5979753
--- /dev/null
+++ b/notes/ingest/2020-12-08_patch_crawl_notes.md
@@ -0,0 +1,111 @@
+
+Notes here about re-ingesting or re-crawling large batches. Goal around end of
+2020 is to generate a broad patch crawl of terminal no-capture attempts for all
+major sources crawled thus far. Have already tried running this process for unpaywall.
+
+For each, want filtered ingest request JSON objects (filtering out platforms
+that don't crawl well, and possibly things like figshare+zenodo), and a broader
+seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a
+heritrix crawl with new config, then re-ingest all the requests individually.
+
+Summary of what to do here:
+
+ OA DOI: expecting some 2.4 million seeds
+ OAI-PMH: expecting some 5 million no-capture URLs, plus more from missing PDF URL not found
+ Unpaywall: another ~900k no-capture URLs (maybe filtered?)
+
+For all, re-attempt for these status codes:
+
+ no-capture
+ cdx-error
+ wayback-error
+ petabox-error
+ gateway-timeout (?)
+
+And at least do bulk re-ingest for these, if updated before 2020-11-20 or so:
+
+ no-pdf-link
+
+## OAI-PMH
+
+Need to re-ingest all of the (many!) no-capture and no-pdf-link
+
+TODO: repec-specific URL extraction?
+
+Skip these OAI prefixes:
+
+ kb.dk
+ bnf.fr
+ hispana.mcu.es
+ bdr.oai.bsb-muenchen.de
+ ukm.si
+ hsp.org
+
+Skip these domains:
+
+ www.kb.dk (kb.dk)
+ kb-images.kb.dk (kb.dk)
+ mdz-nbn-resolving.de (TODO: what prefix?)
+ aggr.ukm.um.si (ukm.si)
+
+Check PDF link extraction for these prefixes, or skip them (TODO):
+
+ repec (mixed success)
+ biodiversitylibrary.org
+ juser.fz-juelich.de
+ americanae.aecid.es
+ www.irgrid.ac.cn
+ hal
+ espace.library.uq.edu.au
+ igi.indrastra.com
+ invenio.nusl.cz
+ hypotheses.org
+ t2r2.star.titech.ac.jp
+ quod.lib.umich.edu
+
+ domain: hemerotecadigital.bne.es
+ domain: bib-pubdb1.desy.de
+ domain: publikationen.bibliothek.kit.edu
+ domain: edoc.mpg.de
+ domain: bibliotecadigital.jcyl.es
+ domain: lup.lub.lu.se
+ domain: orbi.uliege.be
+
+TODO:
+- consider deleting ingest requests from skipped prefixes (large database use)
+
+
+## Unpaywall
+
+About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`.
+
+Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) < '2020-11-20'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json';
+ => COPY 1309990
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json
+ => 1.31M 0:00:51 [25.6k/s]
+
+ cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md
new file mode 100644
index 0000000..d7643f4
--- /dev/null
+++ b/notes/ingest/2021-04_unpaywall.md
@@ -0,0 +1,368 @@
+
+New snapshot released 2021-02-18, finally getting around to a crawl two months
+later.
+
+Intend to do same style of crawl as in the past. One change is that
+sandcrawler-db has moved to a focal VM.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json
+ => 30.0M 3:14:59 [2.57k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json';
+ => COPY 3277484
+
+ # previous, 2020-10 run: COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json
+ => 3.28M 0:01:42 [32.1k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 26385866
+ no-pdf-link | 2132565
+ no-capture | 2092111
+ redirect-loop | 1732543
+ terminal-bad-status | 1504555
+ wayback-content-error | 357345
+ wrong-mimetype | 126070
+ link-loop | 76808
+ cdx-error | 22756
+ null-body | 22066
+ wayback-error | 13768
+ gateway-timeout | 3804
+ petabox-error | 3608
+ spn2-cdx-lookup-failure | 1225
+ redirects-exceeded | 892
+ invalid-host-resolution | 505
+ bad-redirect | 151
+ spn2-error | 108
+ spn2-error:job-failed | 91
+ bad-gzip-encoding | 27
+ (20 rows)
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-01-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 1348623
+ no-capture | 1231582
+ redirect-loop | 45622
+ no-pdf-link | 37312
+ terminal-bad-status | 24162
+ wrong-mimetype | 6684
+ link-loop | 5757
+ null-body | 1288
+ wayback-content-error | 1123
+ cdx-error | 831
+ petabox-error | 697
+ wayback-error | 185
+ invalid-host-resolution | 41
+ gateway-timeout | 29
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ spn2-cdx-lookup-failure | 7
+ bad-redirect | 4
+ timeout | 3
+ redirects-exceeded | 3
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json';
+ => 2020-10: 2,936,404
+ => 2021-04: 1,805,192
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json
+ => 1.81M 0:01:27 [20.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt
+ 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+ 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1,804,211 consumer group lag
+
+## Post-Ingest Stats
+
+Overall status (unpaywall, all time):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 27242251
+ no-pdf-link | 2746237
+ redirect-loop | 1821132
+ terminal-bad-status | 1553441
+ no-capture | 478559
+ wayback-content-error | 357390
+ wrong-mimetype | 127365
+ link-loop | 79389
+ cdx-error | 23170
+ null-body | 23169
+ wayback-error | 13704
+ gateway-timeout | 3803
+ petabox-error | 3642
+ redirects-exceeded | 1427
+ spn2-cdx-lookup-failure | 1214
+ invalid-host-resolution | 505
+ bad-redirect | 153
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ body-too-large | 84
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1213335
+ accepted | no-pdf-link | 29292
+ accepted | redirect-loop | 12769
+ accepted | terminal-bad-status | 11264
+ accepted | no-capture | 10187
+ accepted | cdx-error | 1015
+ accepted | wayback-content-error | 757
+ accepted | wrong-mimetype | 501
+ accepted | link-loop | 407
+ accepted | wayback-error | 207
+ accepted | petabox-error | 189
+ accepted | redirects-exceeded | 125
+ accepted | null-body | 34
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | blocked-cookie | 2
+ accepted | bad-redirect | 1
+ accepted | body-too-large | 1
+ published | success | 20196774
+ published | no-pdf-link | 2647969
+ published | redirect-loop | 1477558
+ published | terminal-bad-status | 1320013
+ published | wayback-content-error | 351931
+ published | no-capture | 297603
+ published | wrong-mimetype | 115440
+ published | link-loop | 76431
+ published | cdx-error | 18125
+ published | null-body | 17559
+ published | wayback-error | 10466
+ published | petabox-error | 2684
+ published | gateway-timeout | 1979
+ published | redirects-exceeded | 947
+ published | spn2-cdx-lookup-failure | 877
+ published | invalid-host-resolution | 457
+ published | bad-redirect | 120
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 70
+ published | body-too-large | 39
+ published | bad-gzip-encoding | 24
+ published | timeout | 24
+ published | blocked-cookie | 23
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | | 2
+ published | pending | 1
+ published | spn2-error:pending | 1
+ published | too-many-redirects | 1
+ submitted | success | 5832117
+ submitted | redirect-loop | 330785
+ submitted | terminal-bad-status | 222152
+ submitted | no-capture | 170766
+ submitted | no-pdf-link | 68934
+ submitted | wrong-mimetype | 11424
+ submitted | null-body | 5576
+ submitted | wayback-content-error | 4702
+ submitted | cdx-error | 4030
+ submitted | wayback-error | 3031
+ submitted | link-loop | 2551
+ submitted | gateway-timeout | 1820
+ submitted | petabox-error | 769
+ submitted | redirects-exceeded | 355
+ submitted | spn2-cdx-lookup-failure | 332
+ submitted | invalid-host-resolution | 48
+ submitted | body-too-large | 44
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 32
+ submitted | spn2-error:job-failed | 14
+ submitted | | 13
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | timeout | 4
+ submitted | bad-gzip-encoding | 3
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 12
+ | no-capture | 3
+ (76 rows)
+
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2192376
+ no-capture | 152183
+ no-pdf-link | 144174
+ redirect-loop | 125988
+ terminal-bad-status | 67307
+ link-loop | 8292
+ wrong-mimetype | 7942
+ null-body | 2270
+ cdx-error | 1223
+ wayback-content-error | 1147
+ petabox-error | 728
+ wayback-error | 155
+ body-too-large | 82
+ invalid-host-resolution | 41
+ gateway-timeout | 28
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ timeout | 7
+ bad-redirect | 6
+ redirects-exceeded | 4
+ (20 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 2,703,999 raw ingest requests (new URLs total)
+- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet
+- 843,753 (31.2%) success from new heritrix crawling
+- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success)
diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md
new file mode 100644
index 0000000..e8748fa
--- /dev/null
+++ b/notes/ingest/2021-05_daily_improvements.md
@@ -0,0 +1,480 @@
+
+Summary of top large broken domains (2021-04-21 "30 day" snapshot):
+
+## acervus.unicamp.br
+
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5;
+
+http://acervus.unicamp.br/index.asp?codigo_sophia=963332
+
+seems like many of these were captures with a blank page? or a redirect to
+the homepage?
+
+http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html
+
+messy, going to move on.
+
+
+## apex.ipk-gatersleben.de
+
+apex.ipk-gatersleben.de | | 1253
+apex.ipk-gatersleben.de | no-pdf-link | 1132
+
+select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5;
+
+https://doi.org/10.25642/ipk/rescoll/4886
+https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331
+
+seem to be datasets/species, not articles.
+
+prefix: 10.25642/ipk
+
+## crossref.org
+
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+
+https://doi.org/10.1515/9781501747045-013
+https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013
+
+Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML.
+
+## openedition
+
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+
+https://doi.org/10.4000/books.pul.34492
+https://books.openedition.org/pul/34492
+
+these are not actually OA books (or at least, not all are)
+
+## chemrxiv.org (figshare)
+
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+
+https://doi.org/10.26434/chemrxiv.14411081
+https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081
+
+these all seem to be *multi-file* entities, thus not good for single file ingest pipeline.
+
+## direct.mit.edu
+
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+
+https://doi.org/10.7551/mitpress/14056.003.0004
+https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext
+
+"not available"
+
+https://doi.org/10.7551/mitpress/12444.003.0004
+
+"not available"
+
+
+## dlc.library.columbia.edu
+
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+
+https://doi.org/10.7916/d8-506w-kk49
+https://dlc.library.columbia.edu/durst/cul:18931zcrk9
+
+document repository.
+this one goes to IA! actually many seem to.
+added extractor, should re-ingest with:
+
+ publisher:"Columbia University" doi_prefix:10.7916 !journal:*
+
+actually, that is like 600k+ results and many are not digitized, so perhaps not.
+
+## doi.ala.org.au
+
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+
+https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052
+https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052
+
+this is a data repository, with filesets, not papers. datacite metadata is
+incorrect.
+
+## fldeploc.dep.state.fl.us
+
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+
+
+https://doi.org/10.35256/ic29
+http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29
+
+re-ingest with:
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+## geoscan.nrcan.gc.ca
+
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+
+https://doi.org/10.4095/295366
+https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366
+
+this is a geographic repository, not papers.
+
+## kiss.kstudy.com
+
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+
+https://doi.org/10.22143/hss21.12.1.121
+http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523
+
+Korean. seems to not actually be theses? can't download.
+
+## linkinghub.elsevier.com
+
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+
+skipping for now, looks like mostly 'forbidden'?
+
+## osf.io
+
+These are important!
+
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+
+https://doi.org/10.31219/osf.io/jux3w
+https://accounts.osf.io/login?service=https://osf.io/jux3w/download
+
+many of these are 404s by browser as well. what does that mean?
+
+## peerj.com
+
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+
+https://doi.org/10.7287/peerj.11155v0.1/reviews/2
+https://peerj.com/articles/11155/reviews/
+
+these are HTML reviews, not papers
+
+## preprints.jmir.org
+
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+
+https://doi.org/10.2196/preprints.22556
+https://preprints.jmir.org/preprint/22556
+
+UGH, looks simple, but javascript.
+
+could try to re-write URL into S3 format? meh.
+
+## psyarxiv.com (OSF?)
+
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+
+https://doi.org/10.31234/osf.io/5jaqg
+https://psyarxiv.com/5jaqg/
+
+Also infuriatingly Javascript, but can do URL hack.
+
+Should reingest, and potentially force-recrawl:
+
+ # about 67k
+ publisher:"Center for Open Science" in_ia:false
+
+## publons.com
+
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+
+https://doi.org/10.1002/jmor.21338/v2/review1
+https://publons.com/publon/40260824/
+
+These are just HTML reviews, not papers.
+
+## saemobilus.sae.org
+
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+
+https://doi.org/10.4271/as1426c
+https://saemobilus.sae.org/content/as1426c
+
+These seem to be standards, and are not open access (paywall)
+
+## scholar.dkyobobook.co.kr
+
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+
+https://doi.org/10.22471/crisis.2021.6.1.18
+http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536
+
+Korean. complex javascript, skipping.
+
+## unreserved.rba.gov.au
+
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+
+https://doi.org/10.47688/rba_archives_2006/04129
+https://unreserved.rba.gov.au/users/login
+
+Don't need to login when I tried in browser? document repo, not papers.
+
+## wayf.switch.ch
+
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+
+https://doi.org/10.24451/arbor.11128
+https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022
+
+Loginwall
+
+## www.bloomsburycollections.com
+
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+
+https://doi.org/10.5040/9781849664264.0008
+https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries
+
+These are primarily not OA/available.
+
+## www.emc2020.eu
+
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+
+https://doi.org/10.22443/rms.emc2020.146
+https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html
+
+These are just abstracts, not papers.
+
+## Emerald
+
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+
+https://doi.org/10.1108/ramj-11-2020-0065
+https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html
+
+Note that these URLs are already HTML fulltext. but the PDF is also available and easy.
+
+re-ingest:
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+## www.humankineticslibrary.com
+
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+
+https://doi.org/10.5040/9781718206625.ch-002
+https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2
+
+paywall
+
+## www.inderscience.com
+
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+
+https://doi.org/10.1504/ijdmb.2020.10036342
+https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb
+
+paywall
+
+## www.ingentaconnect.com
+
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+
+https://doi.org/10.15258/sst.2021.49.1.07
+https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02
+
+Annoying javascript, but easy to work around.
+
+re-ingest:
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+##
+
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+
+https://doi.org/10.5771/9783748907084-439
+https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren
+
+Javascript obfuscated download button?
+
+##
+
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+
+https://doi.org/10.1787/543e84ed-en
+https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en
+
+Paywall.
+
+##
+
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+
+https://doi.org/10.1364/boe.422199
+https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199
+
+Some of these are "pre-registered" DOIs, not published yet. Many of the
+remaining are actually HTML articles, and/or have some stuff in the
+`citation_pdf_url`. A core problem is captchas.
+
+Have started adding support to fatcat for HTML crawl type based on container.
+
+re-ingest:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+##
+
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+
+https://doi.org/10.1093/oseo/instance.00266789
+https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27
+
+loginwall/paywall
+
+##
+
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+
+https://doi.org/10.1127/zfg/40/1996/461
+https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref
+
+paywall
+
+##
+
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+
+https://doi.org/10.1016/j.landurbplan.2021.104104
+https://www.sciencedirect.com/science/article/pii/S0169204621000670
+
+Bunch of crazy new hacks, but seems to be working!
+
+re-ingest:
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2021
+
+##
+
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+
+https://doi.org/10.2478/awutm-2019-0012
+https://www.sciendo.com/article/10.2478/awutm-2019-0012
+
+uses lots of javascript, hard to scrape.
+
+
+## Others (for reference)
+
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+
+## Re-ingests
+
+All the above combined:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u
+ => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie
+ => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864})
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida"
+ => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843})
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald"
+ => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812})
+
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018"
+ => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140})
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2020
+ doi_prefix:10.1016 is_oa:true year:2021
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020"
+ => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021"
+ => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824})
+
+ pmcid:* year:2018
+ pmcid:* year:2019
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018"
+ => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019"
+ => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658})
+
diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md
new file mode 100644
index 0000000..8b6ac09
--- /dev/null
+++ b/notes/ingest/2021-07_unpaywall.md
@@ -0,0 +1,320 @@
+
+New snapshot released 2021-07-02. Should be "boring" ingest and crawl.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json
+ => 32.2M 3:01:52 [2.95k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260})
+
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+ => COPY 3556146
+
+ # previous, 2020-10 run: COPY 4216339
+ # previous, 2021-07 run: COPY 3277484
+
+Oops, should have run this instead, with the date filter:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+
+But didn't, so processed all instead.
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json
+ => 3.56M 0:01:59 [29.8k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done, on 2021-07-13
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 1831827
+ success | 1343604
+ redirect-loop | 103999
+ terminal-bad-status | 19845
+ no-pdf-link | 17448
+ link-loop | 5027
+ wrong-mimetype | 2270
+ cdx-error | 523
+ body-too-large | 321
+ null-body | 298
+ wayback-content-error | 242
+ petabox-error | 155
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ wayback-error | 109
+ blocked-cookie | 9
+ timeout | 7
+ | 3
+ bad-redirect | 3
+ spn2-cdx-lookup-failure | 3
+ (20 rows)
+
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json';
+ => COPY 1743186
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json
+ => 1.74M 0:01:33 [18.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt
+ 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+ 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ 3287992 total
+
+Then run crawl (see `journal-crawls` git repo).
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1.74M 0:01:59 [14.6k/s]
+
+## Post-Ingest Stats
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2690258
+ redirect-loop | 227328
+ no-capture | 157368
+ terminal-bad-status | 118943
+ no-pdf-link | 92698
+ blocked-cookie | 19478
+ link-loop | 9249
+ wrong-mimetype | 4918
+ cdx-error | 1786
+ wayback-error | 1497
+ null-body | 1302
+ body-too-large | 433
+ wayback-content-error | 245
+ petabox-error | 171
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ timeout | 12
+ bad-redirect | 4
+ | 3
+ spn2-cdx-lookup-failure | 1
+ (20 rows)
+
+Only the recent updates, by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------+---------
+ accepted | success | 103144
+ accepted | no-pdf-link | 53981
+ accepted | terminal-bad-status | 4102
+ accepted | link-loop | 2799
+ accepted | no-capture | 2315
+ accepted | redirect-loop | 2171
+ accepted | blocked-cookie | 234
+ accepted | cdx-error | 140
+ accepted | wayback-error | 101
+ accepted | wrong-mimetype | 38
+ accepted | null-body | 10
+ accepted | petabox-error | 5
+ accepted | wayback-content-error | 4
+ accepted | gateway-timeout | 2
+ accepted | body-too-large | 2
+ published | success | 1919100
+ published | no-capture | 130104
+ published | redirect-loop | 127482
+ published | terminal-bad-status | 43118
+ published | no-pdf-link | 33505
+ published | blocked-cookie | 19034
+ published | link-loop | 6241
+ published | wrong-mimetype | 4163
+ published | null-body | 1195
+ published | cdx-error | 1151
+ published | wayback-error | 1105
+ published | wayback-content-error | 197
+ published | body-too-large | 195
+ published | petabox-error | 118
+ published | gateway-timeout | 35
+ published | invalid-host-resolution | 13
+ published | timeout | 8
+ published | bad-redirect | 2
+ published | spn2-cdx-lookup-failure | 1
+ published | bad-gzip-encoding | 1
+ submitted | success | 668014
+ submitted | redirect-loop | 97675
+ submitted | terminal-bad-status | 71723
+ submitted | no-capture | 24949
+ submitted | no-pdf-link | 5212
+ submitted | wrong-mimetype | 717
+ submitted | cdx-error | 495
+ submitted | wayback-error | 291
+ submitted | body-too-large | 236
+ submitted | blocked-cookie | 210
+ submitted | link-loop | 209
+ submitted | invalid-host-resolution | 107
+ submitted | gateway-timeout | 101
+ submitted | null-body | 97
+ submitted | petabox-error | 48
+ submitted | wayback-content-error | 44
+ submitted | timeout | 4
+ submitted | | 3
+ submitted | bad-redirect | 2
+ submitted | remote-server-error | 1
+ (55 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 3,325,954 raw ingest requests (new URLs total)
+- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), so a crawl was attempted for them
+- 1,346,654 (77% of crawled) success from new heritrix crawling
+- 2,690,258 (80%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success)
+
+## Live Ingest Follow-Up
+
+Will run SPN requests on the ~160k `no-capture` URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json';
+ => COPY 157371
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json
+ => 157k 0:00:04 [31.6k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md
new file mode 100644
index 0000000..5f92196
--- /dev/null
+++ b/notes/ingest/2021-08_mag.md
@@ -0,0 +1,400 @@
+
+Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest.
+Also want to re-ingest some old/failed ingests, now that pipeline/code has
+improved.
+
+Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs.
+
+
+## Persist Ingest Requests
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000})
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 22.5M 0:46:00 [8.16k/s]
+ => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585})
+
+Roughly 8.6 million new URLs
+
+## Pre-Crawl Status Counts
+
+Status of combined old and new requests, with some large domains removed:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ success | 26123975
+ | 6664846
+ no-pdf-link | 1859908
+ redirect-loop | 1532405
+ no-capture | 1199126
+ link-loop | 1157010
+ terminal-bad-status | 832362
+ gateway-timeout | 202158
+ spn2-cdx-lookup-failure | 81406
+ wrong-mimetype | 69087
+ invalid-host-resolution | 37262
+ wayback-error | 21340
+ petabox-error | 11237
+ null-body | 9414
+ wayback-content-error | 2199
+ cdx-error | 1893
+ spn2-error | 1741
+ spn2-error:job-failed | 971
+ blocked-cookie | 902
+ spn2-error:invalid-url-syntax | 336
+ (20 rows)
+
+And just the new URLs (note that domain filter shouldn't be required, but
+keeping for consistency):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ | 6664780
+ success | 1957844
+ redirect-loop | 23357
+ terminal-bad-status | 9385
+ no-pdf-link | 8315
+ no-capture | 6892
+ link-loop | 4517
+ wrong-mimetype | 3864
+ cdx-error | 1749
+ blocked-cookie | 842
+ null-body | 747
+ wayback-error | 688
+ wayback-content-error | 570
+ gateway-timeout | 367
+ petabox-error | 340
+ spn2-cdx-lookup-failure | 150
+ read-timeout | 122
+ not-found | 119
+ invalid-host-resolution | 63
+ spn2-error | 23
+ (20 rows)
+
+## Dump Initial Bulk Ingest Requests
+
+Note that this is all-time, not just recent, and will re-process a lot of
+"no-pdf-link":
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-pdf-link'
+ OR ingest_file_result.status = 'cdx-error'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json';
+ => COPY 8526647
+
+Transform to ingest requests:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json
+ => 8.53M 0:03:40
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+Updated stats after running initial bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 5184994
+ no-capture | 3284416
+ redirect-loop | 98685
+ terminal-bad-status | 28733
+ link-loop | 28518
+ blocked-cookie | 22338
+ no-pdf-link | 19073
+ wrong-mimetype | 9122
+ null-body | 2793
+ wayback-error | 2128
+ wayback-content-error | 1233
+ cdx-error | 1198
+ petabox-error | 617
+ gateway-timeout | 395
+ not-found | 130
+ read-timeout | 128
+ | 111
+ invalid-host-resolution | 63
+ spn2-cdx-lookup-failure | 24
+ spn2-error | 20
+ (20 rows)
+
+## Generate Seedlist
+
+For crawling, do a similar (but not identical) dump:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json';
+ => COPY 4599519
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json
+ => 4.60M 0:02:55 [26.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+ cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ => DONE
+
+ wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt
+ 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+
+## Post-Crawl Bulk Re-Ingest
+
+Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by
+hash, URL agnostic).
+
+Enqueue for bulk re-ingest:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => Thu 19 Aug 2021 09:10:59 PM UTC
+
+
+## Post-Ingest Stats
+
+Just the new stuff (compare against above for delta):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 7748241 89.2%
+ no-capture | 429688 4.9%
+ redirect-loop | 172831 2.0%
+ terminal-bad-status | 94029 1.1%
+ no-pdf-link | 86437 1.0%
+ blocked-cookie | 67903 0.8%
+ link-loop | 50622
+ wrong-mimetype | 21064
+ null-body | 6650
+ cdx-error | 3313
+ wayback-error | 2630
+ gateway-timeout | 399
+ petabox-error | 268
+ wayback-content-error | 170
+ not-found | 130
+ read-timeout | 128
+ | 109
+ invalid-host-resolution | 63
+ bad-redirect | 39
+ spn2-error | 20
+ (20 rows)
+
+New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397
+
+Overall success of new batch: 7748241. / 8686315 = 89.2%
+
+And combined (old and new) status again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 31990062
+ redirect-loop | 1704717
+ no-capture | 1263462
+ link-loop | 1218280
+ blocked-cookie | 1213838
+ no-pdf-link | 1096664
+ terminal-bad-status | 960070
+ gateway-timeout | 202190
+ wrong-mimetype | 86557
+ invalid-host-resolution | 37262
+ null-body | 15443
+ wayback-error | 12839
+ cdx-error | 4047
+ spn2-error | 1731
+ spn2-error:job-failed | 962
+ petabox-error | 463
+ wayback-content-error | 379
+ spn2-error:invalid-url-syntax | 336
+ spn2-error:soft-time-limit-exceeded | 203
+ | 175
+ (20 rows)
+
+New success total: 31990062 - 26123975 = 5,866,087
+
+A full 1,263,462 no-capture that could be attempted... though many of those may
+be excluded for a specific reason.
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
new file mode 100644
index 0000000..fded7b3
--- /dev/null
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -0,0 +1,1578 @@
+
+Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
+re-crawling content which failed to ingest the first time.
+
+May fold this in with more general patch crawling.
+
+## Basic Counts
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 14145387
+ no-pdf-link | 12063022
+ no-capture | 5485640
+ redirect-loop | 2092705
+ terminal-bad-status | 747372
+ wrong-mimetype | 597219
+ link-loop | 542144
+ null-body | 93566
+ cdx-error | 19798
+ petabox-error | 17943
+ | 15283
+ wayback-error | 13897
+ gateway-timeout | 511
+ skip-url-blocklist | 184
+ wayback-content-error | 146
+ bad-redirect | 137
+ redirects-exceeded | 120
+ bad-gzip-encoding | 116
+ timeout | 80
+ blocked-cookie | 64
+ (20 rows)
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 40;
+
+
+ oai_prefix | success | total
+ ---------------------------+---------+---------
+ repec | 1133175 | 2783448
+ hal | 573218 | 1049607
+ www.irgrid.ac.cn | 18007 | 748828
+ cds.cern.ch | 74078 | 688091
+ americanae.aecid.es | 71310 | 572792
+ juser.fz-juelich.de | 23026 | 518551
+ espace.library.uq.edu.au | 6649 | 508960
+ igi.indrastra.com | 59629 | 478577
+ archive.ugent.be | 65306 | 424014
+ hrcak.srce.hr | 404085 | 414897
+ zir.nsk.hr | 156753 | 397200
+ renati.sunedu.gob.pe | 79362 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7997 | 354529
+ generic.eprints.org | 263566 | 340470
+ invenio.nusl.cz | 6340 | 325867
+ evastar-karlsruhe.de | 62282 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ diva.org | 67917 | 298348
+ t2r2.star.titech.ac.jp | 1085 | 289388
+ edpsciences.org | 139495 | 284972
+ repository.ust.hk | 10245 | 283417
+ revues.org | 151156 | 277497
+ pure.atira.dk | 13492 | 260754
+ bibliotecadigital.jcyl.es | 50606 | 254134
+ escholarship.org/ark | 140835 | 245203
+ ojs.pkp.sfu.ca | 168029 | 229387
+ lup.lub.lu.se | 49358 | 226602
+ library.wur.nl | 15051 | 216738
+ digitalrepository.unm.edu | 111704 | 211749
+ infoscience.tind.io | 60166 | 207299
+ edoc.mpg.de | 0 | 205252
+ erudit.org | 168490 | 197803
+ delibra.bg.polsl.pl | 38666 | 196652
+ n/a | 0 | 193814
+ aleph.bib-bvb.de | 4349 | 186666
+ serval.unil.ch | 41643 | 186372
+ orbi.ulg.ac.be | 2400 | 184551
+ digitalcommons.unl.edu | 144025 | 184372
+ bib-pubdb1.desy.de | 33525 | 182717
+ (40 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 50;
+
+ oai_prefix | status | count
+ ---------------------------+---------------+---------
+ repec | success | 1133175
+ repec | no-pdf-link | 638105
+ hal | success | 573218
+ cds.cern.ch | no-capture | 540380
+ repec | redirect-loop | 516451
+ juser.fz-juelich.de | no-pdf-link | 477881
+ americanae.aecid.es | no-pdf-link | 417766
+ hrcak.srce.hr | success | 404085
+ www.irgrid.ac.cn | no-pdf-link | 370908
+ hal | no-pdf-link | 359252
+ www.irgrid.ac.cn | no-capture | 355532
+ espace.library.uq.edu.au | no-pdf-link | 320479
+ igi.indrastra.com | no-pdf-link | 318242
+ repec | no-capture | 316981
+ invenio.nusl.cz | no-pdf-link | 309802
+ rour.neicon.ru | redirect-loop | 300911
+ hypotheses.org | no-pdf-link | 300251
+ renati.sunedu.gob.pe | no-capture | 282800
+ t2r2.star.titech.ac.jp | no-pdf-link | 272045
+ generic.eprints.org | success | 263566
+ quod.lib.umich.edu | no-pdf-link | 259661
+ archive.ugent.be | no-capture | 256127
+ evastar-karlsruhe.de | no-pdf-link | 248939
+ zir.nsk.hr | link-loop | 226919
+ repository.ust.hk | no-pdf-link | 208569
+ edoc.mpg.de | no-pdf-link | 199758
+ bibliotecadigital.jcyl.es | no-pdf-link | 188433
+ orbi.ulg.ac.be | no-pdf-link | 172373
+ diva.org | no-capture | 171115
+ lup.lub.lu.se | no-pdf-link | 168652
+ erudit.org | success | 168490
+ ojs.pkp.sfu.ca | success | 168029
+ lib.dr.iastate.edu | success | 158494
+ zir.nsk.hr | success | 156753
+ digital.kenyon.edu | success | 154900
+ revues.org | success | 151156
+ books.openedition.org | no-pdf-link | 149607
+ freidok.uni-freiburg.de | no-pdf-link | 146837
+ digitalcommons.unl.edu | success | 144025
+ escholarship.org/ark | success | 140835
+ culeuclid | link-loop | 140291
+ edpsciences.org | success | 139495
+ serval.unil.ch | no-pdf-link | 138644
+ bib-pubdb1.desy.de | no-pdf-link | 133815
+ krm.or.kr | no-pdf-link | 132461
+ pure.atira.dk | no-pdf-link | 132179
+ oai-gms.dimdi.de | redirect-loop | 131409
+ aleph.bib-bvb.de | no-capture | 128261
+ library.wur.nl | no-pdf-link | 124718
+ lirias2repo.kuleuven.be | no-capture | 123106
+ (50 rows)
+
+Note: could we just delete the "excluded" rows, stop harvesting them in the
+future, and filter them at ingest time (in the transform script)?
+
+
+
+## Investigate no-pdf-link sandcrawler improvements
+
+Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works:
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%'
+ ORDER BY random()
+ LIMIT 10;
+
+Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works):
+
+ \x auto
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ ORDER BY random()
+ LIMIT 30;
+
+### repec (SKIP-PREFIX)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35
+base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html
+terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647
+base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf
+terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75
+base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec
+terminal_url | https://www.jstor.org/stable/1884373
+
+Huh! This is just a catalog of other domains. Should probably skip it.
+
+DONE: skip/filter repec
+
+### juser.fz-juelich.de (SCOPE)
+
+-[ RECORD 1 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:132217
+base_url | http://juser.fz-juelich.de/record/132217
+terminal_url | http://juser.fz-juelich.de/record/132217
+
+Poster; no files.
+
+-[ RECORD 2 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:268598
+base_url | http://juser.fz-juelich.de/record/268598
+terminal_url | http://juser.fz-juelich.de/record/268598
+
+Journal.
+
+-[ RECORD 3 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:126613
+base_url | http://juser.fz-juelich.de/record/126613
+terminal_url | http://juser.fz-juelich.de/record/126613
+
+-[ RECORD 4 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:67362
+base_url | http://juser.fz-juelich.de/record/67362
+terminal_url | http://juser.fz-juelich.de/record/67362
+-[ RECORD 5 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:869189
+base_url | http://juser.fz-juelich.de/record/869189
+terminal_url | http://juser.fz-juelich.de/record/869189
+-[ RECORD 6 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:810746
+base_url | http://juser.fz-juelich.de/record/810746
+terminal_url | http://juser.fz-juelich.de/record/810746
+-[ RECORD 7 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:52897
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+-[ RECORD 8 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:114755
+base_url | http://juser.fz-juelich.de/record/114755
+terminal_url | http://juser.fz-juelich.de/record/114755
+-[ RECORD 9 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:58025
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+
+The search URLs seem redundant? Not going to try to handle those.
+
+"Powered by Invenio v1.1.7"
+
+None of these examples seem to be papers. Maybe we can filter these better
+at the harvest or transform stage?
+
+### americanae.aecid.es (MIXED)
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:502896
+base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+
+just a metadata record? links to redalyc
+
+METADATA-ONLY
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:534600
+base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:524567
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+
+NOT-FOUND (404)
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:378914
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+
+Some single-page image archival thing? bespoke, skipping.
+
+SKIP-BESPOKE
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:526142
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+
+NOT-FOUND (404)
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:373408
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+
+NOT-FOUND (404)
+
+### www.irgrid.ac.cn (SKIP-PREFIX)
+
+Chinese Academy of Sciences Institutional Repositories Grid
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1749980
+base_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+
+Can't access
+
+FORBIDDEN
+
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/857397
+base_url | http://www.irgrid.ac.cn/handle/1471x/857397
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397
+
+Just linking to another IR; skip it.
+
+http://ir.ipe.ac.cn/handle/122111/10608
+
+requires login
+
+DONE: '/password-login;jsessionid' as a loginwall URL pattern
+ http://ir.ipe.ac.cn/handle/122111/10608
+ http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf
+
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1060447
+base_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1671377
+base_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1178430
+base_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2488017
+base_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/977147
+base_url | http://www.irgrid.ac.cn/handle/1471x/977147
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2454503
+base_url | http://ir.nwipb.ac.cn/handle/363003/9957
+terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957
+
+this domain is a disappointment :(
+
+Should continue crawling, as the metadata is open and good, but we likely won't get fulltext.
+
+### hal (FIXED-PARTIAL)
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00744951v1
+base_url | https://hal.archives-ouvertes.fr/hal-00744951
+terminal_url | https://hal.archives-ouvertes.fr/hal-00744951
+
+Off-site OA link.
+
+FIXED-HAL
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-01065398v1
+base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf
+terminal_url | https://hal.archives-ouvertes.fr/index/index
+-[ RECORD 3 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:lirmm-00371599v1
+base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+
+To elsevier :(
+
+-[ RECORD 4 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00284780v1
+base_url | https://hal.archives-ouvertes.fr/hal-00284780
+terminal_url | https://hal.archives-ouvertes.fr/hal-00284780
+
+METADATA-ONLY
+
+-[ RECORD 5 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00186151v1
+base_url | https://hal.archives-ouvertes.fr/hal-00186151
+terminal_url | https://hal.archives-ouvertes.fr/hal-00186151
+
+METADATA-ONLY
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00399754v1
+base_url | https://hal.archives-ouvertes.fr/hal-00399754
+terminal_url | https://hal.archives-ouvertes.fr/hal-00399754
+
+METADATA-ONLY
+
+
+### espace.library.uq.edu.au (SKIP)
+
+-[ RECORD 1 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:136497
+base_url | https://espace.library.uq.edu.au/view/UQ:136497
+terminal_url | https://espace.library.uq.edu.au/view/UQ:136497
+-[ RECORD 2 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:411389
+base_url | https://espace.library.uq.edu.au/view/UQ:411389
+terminal_url | https://espace.library.uq.edu.au/view/UQ:411389
+-[ RECORD 3 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:401773
+base_url | https://espace.library.uq.edu.au/view/UQ:401773
+terminal_url | https://espace.library.uq.edu.au/view/UQ:401773
+-[ RECORD 4 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:675334
+base_url | https://espace.library.uq.edu.au/view/UQ:675334
+terminal_url | https://espace.library.uq.edu.au/view/UQ:675334
+-[ RECORD 5 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:312311
+base_url | https://espace.library.uq.edu.au/view/UQ:312311
+terminal_url | https://espace.library.uq.edu.au/view/UQ:312311
+-[ RECORD 6 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:209401
+base_url | https://espace.library.uq.edu.au/view/UQ:209401
+terminal_url | https://espace.library.uq.edu.au/view/UQ:209401
+-[ RECORD 7 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:327188
+base_url | https://espace.library.uq.edu.au/view/UQ:327188
+terminal_url | https://espace.library.uq.edu.au/view/UQ:327188
+
+Very javascript heavy (skeletal HTML). And just links to fulltext on publisher
+sites.
+
+### igi.indrastra.com (METADATA-ONLY)
+
+-[ RECORD 1 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:267221
+base_url | http://igi.indrastra.com/items/show/267221
+terminal_url | http://igi.indrastra.com/items/show/267221
+-[ RECORD 2 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:181799
+base_url | http://igi.indrastra.com/items/show/181799
+terminal_url | http://igi.indrastra.com/items/show/181799
+-[ RECORD 3 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:125382
+base_url | http://igi.indrastra.com/items/show/125382
+terminal_url | http://igi.indrastra.com/items/show/125382
+-[ RECORD 4 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:47266
+base_url | http://igi.indrastra.com/items/show/47266
+terminal_url | http://igi.indrastra.com/items/show/47266
+-[ RECORD 5 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:12872
+base_url | http://igi.indrastra.com/items/show/12872
+terminal_url | http://igi.indrastra.com/items/show/12872
+-[ RECORD 6 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:231620
+base_url | http://igi.indrastra.com/items/show/231620
+terminal_url | http://igi.indrastra.com/items/show/231620
+
+"Proudly powered by Omeka"
+
+### invenio.nusl.cz (METADATA-ONLY)
+
+ oai_id | base_url | terminal_url
+----------------------------+------------------------------------+--------------------------------------
+ oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409
+ oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783
+ oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961
+ oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800
+ oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695
+ oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393
+ oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987
+ oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396
+ oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512
+ oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631
+
+Metadata only (at least this set)
+
+### hypotheses.org
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:mittelalter/9529
+base_url | http://mittelalter.hypotheses.org/9529
+terminal_url | https://mittelalter.hypotheses.org/9529
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/18638
+base_url | http://archivalia.hypotheses.org/18638
+terminal_url | https://archivalia.hypotheses.org/18638
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/13614
+base_url | http://archivalia.hypotheses.org/13614
+terminal_url | https://archivalia.hypotheses.org/13614
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:teteschercheuses/2785
+base_url | http://teteschercheuses.hypotheses.org/2785
+terminal_url | https://teteschercheuses.hypotheses.org/2785
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:altervsego/608
+base_url | http://altervsego.hypotheses.org/608
+terminal_url | http://altervsego.hypotheses.org/608
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivewk1/21905
+base_url | http://archivewk1.hypotheses.org/21905
+terminal_url | https://archivewk1.hypotheses.org/21905
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:slkdiaspo/3321
+base_url | http://slkdiaspo.hypotheses.org/3321
+terminal_url | https://slkdiaspo.hypotheses.org/3321
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:diga/280
+base_url | http://diga.hypotheses.org/280
+terminal_url | https://diga.hypotheses.org/280
+
+These are all a big mix... basically blogs. Should continue crawling, but expect no yield.
+
+### t2r2.star.titech.ac.jp (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00105099
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00101346
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50161100
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00232407
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50120040
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50321440
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50235666
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+
+
+### quod.lib.umich.edu
+
+-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2
+base_url | http://name.umdl.umich.edu/acf2679.0015.003
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:b14970.0001.001
+base_url | http://name.umdl.umich.edu/B14970.0001.001
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3
+base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43
+base_url | http://name.umdl.umich.edu/acg2248.1-16.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9
+base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9
+base_url | http://name.umdl.umich.edu/acg1336.1-24.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a
+base_url | http://name.umdl.umich.edu/africanamer.0002.32a
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a
+
+These are... issues of journals? Should continue to crawl, but not expect much.
+
+### evastar-karlsruhe.de (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:270011444
+base_url | https://publikationen.bibliothek.kit.edu/270011444
+terminal_url | https://publikationen.bibliothek.kit.edu/270011444
+-[ RECORD 2 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000050117
+base_url | https://publikationen.bibliothek.kit.edu/1000050117
+terminal_url | https://publikationen.bibliothek.kit.edu/1000050117
+-[ RECORD 3 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:362296
+base_url | https://publikationen.bibliothek.kit.edu/362296
+terminal_url | https://publikationen.bibliothek.kit.edu/362296
+-[ RECORD 4 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:23042000
+base_url | https://publikationen.bibliothek.kit.edu/23042000
+terminal_url | https://publikationen.bibliothek.kit.edu/23042000
+-[ RECORD 5 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000069945
+base_url | https://publikationen.bibliothek.kit.edu/1000069945
+terminal_url | https://publikationen.bibliothek.kit.edu/1000069945
+
+
+### repository.ust.hk
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-67233
+base_url | http://repository.ust.hk/ir/Record/1783.1-67233
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-63232
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017
+terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-2891
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103
+terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-56231
+base_url | http://repository.ust.hk/ir/Record/1783.1-56231
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231
+
+[...]
+
+-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-24872
+base_url | http://repository.ust.hk/ir/Record/1783.1-24872
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872
+-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-3457
+base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-73215
+base_url | http://repository.ust.hk/ir/Record/1783.1-73215
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215
+
+DONE: gateway.isiknowledge.com is bogus/blocking?
+
+
+### edoc.mpg.de (SKIP-DEPRECATED)
+
+ oai_id | base_url | terminal_url
+------------------------+---------------------------+---------------------------
+ oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650
+ oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195
+ oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655
+ oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179
+ oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141
+ oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412
+ oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531
+ oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047
+ oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650
+ oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852
+
+This whole instance seems to have been replaced.
+
+### bibliotecadigital.jcyl.es (SKIP-DIGITIZED)
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000039962
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+-[ RECORD 2 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14075
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+-[ RECORD 3 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:4842
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+-[ RECORD 4 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14799
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+-[ RECORD 5 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:821
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+
+Digitized images as pages; too much to deal with for now.
+
+### orbi.ulg.ac.be
+
+-[ RECORD 1 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/128079
+base_url | https://orbi.uliege.be/handle/2268/128079
+terminal_url | https://orbi.uliege.be/handle/2268/128079
+-[ RECORD 2 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/67659
+base_url | https://orbi.uliege.be/handle/2268/67659
+terminal_url | https://orbi.uliege.be/handle/2268/67659
+-[ RECORD 3 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/35521
+base_url | https://orbi.uliege.be/handle/2268/35521
+terminal_url | https://orbi.uliege.be/handle/2268/35521
+-[ RECORD 4 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/107922
+base_url | https://orbi.uliege.be/handle/2268/107922
+terminal_url | https://orbi.uliege.be/handle/2268/107922
+-[ RECORD 5 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/215694
+base_url | https://orbi.uliege.be/handle/2268/215694
+terminal_url | https://orbi.uliege.be/handle/2268/215694
+
+Described below.
+
+### library.wur.nl (FIXED-BESPOKE)
+
+ oai_id | base_url | terminal_url
+ -----------------------------------+------------------------------------------------+------------------------------------------------
+ oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939
+ oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707
+ oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208
+ oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378
+ oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416
+ oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930
+ oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076
+ oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109
+ oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146
+ oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922
+ (10 rows)
+
+Seems like a one-off site? But added a pattern.
+
+### pure.atira.dk
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38
+base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694
+terminal_url | https://www.tandfonline.com/action/cookieAbsent
+-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+
+Metadata only
+
+DONE: /cookieAbsent is cookie block
+ https://www.tandfonline.com/action/cookieAbsent
+
+### bib-pubdb1.desy.de (FIXED-INVENIO)
+
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:96756
+base_url | http://bib-pubdb1.desy.de/record/96756
+terminal_url | http://bib-pubdb1.desy.de/record/96756
+
+Metadata only.
+
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:416556
+base_url | http://bib-pubdb1.desy.de/record/416556
+terminal_url | http://bib-pubdb1.desy.de/record/416556
+
+Fixed!
+
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:414545
+base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:170169
+base_url | http://bib-pubdb1.desy.de/record/170169
+terminal_url | http://bib-pubdb1.desy.de/record/170169
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:191154
+base_url | http://bib-pubdb1.desy.de/record/191154
+terminal_url | http://bib-pubdb1.desy.de/record/191154
+
+Metadata only
+
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:155092
+base_url | http://bib-pubdb1.desy.de/record/155092
+terminal_url | http://bib-pubdb1.desy.de/record/155092
+
+Fixed!
+
+-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:97158
+base_url | http://bib-pubdb1.desy.de/record/97158
+terminal_url | http://bib-pubdb1.desy.de/record/97158
+
+Metadata only
+
+"Powered by Invenio v1.1.7"
+
+Can/should skip the "search" URLs
+
+### serval.unil.ch
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_60346fc75171
+base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_4db47fc4b593
+base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_57aac24fe115
+base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_deabae6baf6c
+base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_a5ec0df1370f
+base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_080300c2e23c
+base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_de777dd2b07f
+base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F
+-[ RECORD 8 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_5e824e244c27
+base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27
+
+Metadata only? See elsewhere.
+
+### Random Links
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dbc.wroc.pl:41031
+base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+
+This is some platform/package thing. PDF is in an iframe. Platform is "DLibra".
+FIXED-DLIBRA
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/174291
+base_url | https://orbi.uliege.be/handle/2268/174291
+terminal_url | https://orbi.uliege.be/handle/2268/174291
+
+DSpace platform. There are multiple files, and little to "select" on.
+
+https://orbi.uliege.be/handle/2268/174200 has only a single PDF and is easier to work with
+
+PARTIAL-DSPACE
+
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.tue.nl:664163
+base_url | http://repository.tue.nl/664163
+terminal_url | http://repository.tue.nl/664163
+
+Ah, this is the Pure platform from Elsevier.
+Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance
+
+FIXED-PURE
+
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:49579
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+
+(handled above)
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/97937
+base_url | https://orcid.org/0000-0002-2066-2082
+terminal_url | https://orcid.org/0000-0002-2066-2082
+
+ORCID! Skip it.
+
+DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time.
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:edoc.mpg.de:360269
+base_url | http://edoc.mpg.de/360269
+terminal_url | http://edoc.mpg.de/360269
+
+Seems like this whole repo has disappeared, or been replaced by... pure? maybe a different pure?
+
+DONE: edoc.mpg.de -> pure.mpg.de
+
+-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:books.openedition.org:msha/17716
+base_url | http://books.openedition.org/msha/17716
+terminal_url | https://books.openedition.org/msha/17716
+
+OpenEdition is free to read as HTML, but not as PDF (or EPUB, etc).
+
+TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest)
+
+HTML-WORKED
+
+-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epub.oeaw.ac.at:0x003aba48
+base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+
+requires login
+
+FORBIDDEN
+
+-[ RECORD 9 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/88986
+base_url | https://orcid.org/0000-0002-4147-2560
+terminal_url | https://orcid.org/0000-0002-4147-2560
+
+DONE: skip orcids
+
+-[ RECORD 10 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-28786
+base_url | http://repository.ust.hk/ir/Record/1783.1-28786
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786
+
+Generator: VuFind 5.1.1
+just a metadata record
+
+METADATA-ONLY
+
+-[ RECORD 11 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:rcin.org.pl:50797
+base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+
+Seems like a software platform? not sure.
+
+METADATA-ONLY
+
+-[ RECORD 12 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dea.lib.unideb.hu:2437/69641
+base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+
+-[ RECORD 13 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871
+base_url | http://handle.unsw.edu.au/1959.4/64871
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L
+
+-[ RECORD 14 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.wbc.poznan.pl:225930
+base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+
+SOFT-404
+
+-[ RECORD 15 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.erciyes.edu.tr:105
+base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105
+terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105
+
+GONE (domain not registered)
+
+-[ RECORD 16 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:37500
+base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+
+Seems like a bespoke site
+
+SKIP-BESPOKE
+
+-[ RECORD 17 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50401364
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+
+METADATA-ONLY
+
+-[ RECORD 18 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epubs.cclrc.ac.uk:work/4714
+base_url | http://purl.org/net/epubs/work/4714
+terminal_url | https://epubs.stfc.ac.uk/work/4714
+
+It's got a purl! haha.
+
+METADATA-ONLY
+
+------
+
+Another batch! With some repeat domains removed.
+
+-[ RECORD 1 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc
+base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc
+terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov
+
+SKIP
+
+-[ RECORD 2 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-05302014-183910
+base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+
+Some software platform? Pretty basic/bespoke
+
+FIXED-PARTIAL
+
+-[ RECORD 3 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000098246
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+
+SKIP (see elsewhere)
+
+-[ RECORD 7 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:elektra.cdaea.es:documento.29259
+base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+
+Photo.
+
+SKIP-SCOPE
+
+-[ RECORD 9 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829
+base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L
+
+METADATA-ONLY
+
+-[ RECORD 12 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a
+base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+
+unsure
+
+-[ RECORD 16 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/369344
+base_url | https://library.wur.nl/WebQuery/wurpubs/369344
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344
+
+this specific record not OA (but site is fine/fixed)
+
+-[ RECORD 17 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:escholarship.umassmed.edu:oapubs-2146
+base_url | https://escholarship.umassmed.edu/oapubs/1147
+terminal_url | http://escholarship.umassmed.edu/oapubs/1147/
+
+just links to publisher (no content in repo)
+
+-[ RECORD 18 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010
+base_url | https://digitalcommons.usu.edu/wild_facpub/11
+terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/
+
+also just links to publisher (no content in repo)
+
+-[ RECORD 25 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:igi.indrastra.com:306768
+base_url | http://igi.indrastra.com/items/show/306768
+terminal_url | http://igi.indrastra.com/items/show/306768
+
+(see elsewhere)
+
+-[ RECORD 26 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:fau.digital.flvc.org:fau_9804
+base_url | http://purl.flvc.org/fcla/dt/12932
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804
+
+Islandora.
+
+-[ RECORD 27 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.lu.lv:7/16019
+base_url | https://dspace.lu.lv/dspace/handle/7/16019
+terminal_url | https://dspace.lu.lv/dspace/handle/7/16019
+
+LOGINWALL
+
+-[ RECORD 28 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:zir.nsk.hr:umas_218
+base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+
+REMOVED
+
+
+-[ RECORD 29 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:36390
+base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+
+Book, with chapters, not an individual work.
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:krm.or.kr:10056135m201r
+base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y
+terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135
+
+research results repository; keep crawling
+
+SKIP-SCOPE
+
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.db-thueringen.de:dbt_mods_00005191
+base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+
+powered by "MyCoRe"
+
+FIXED-MYCORE
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405
+base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+
+seems to be a general purpose regional library? not research-specific
+
+SKIP-UNSURE
+
+-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-02272019-123644
+base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+
+This specific URL is not available (FORBIDDEN)
+
+others have multiple files, not just a single PDF:
+https://etd.adm.unipi.it/t/etd-09102013-124430/
+
+SKIP-UNSURE
+
+-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:commons.ln.edu.hk:sw_master-5408
+base_url | https://commons.ln.edu.hk/sw_master/4408
+terminal_url | https://commons.ln.edu.hk/sw_master/4408/
+
+worth crawling I guess
+
+METADATA-ONLY
+
+-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:mouseion.jax.org:ssbb1976-1224
+base_url | https://mouseion.jax.org/ssbb1976/225
+terminal_url | https://mouseion.jax.org/ssbb1976/225/
+
+METADATA-ONLY
+
+-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aleph.bib-bvb.de:bvb01-016604343
+base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer
+terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true
+
+SOFT-404 / FORBIDDEN (cookie timeout)
+
+-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bivaldi.gva.es:11740
+base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+
+
+-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/443282
+base_url | https://library.wur.nl/WebQuery/wurpubs/443282
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282
+
+DIGIBIS platform (like some others)
+
+FIXED-PARTIAL
+
+-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:hal:in2p3-00414135v1
+base_url | http://hal.in2p3.fr/in2p3-00414135
+terminal_url | http://hal.in2p3.fr:80/in2p3-00414135
+
+METADATA-ONLY
+
+-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aaltodoc.aalto.fi:123456789/13201
+base_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+
+This specific record is not accessible.
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002
+
+DSpace 5.4
+
+Worked (from recent changes)
+
+
+-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:sedici.unlp.edu.ar:10915/40144
+base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+
+This is a journal! Cool. Plone software platform.
+
+FIXED
+
+## Top no-capture Domains
+
+Top terminal no-capture domains:
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | count
+ -----------------------------------+-------
+ digitalrepository.unm.edu | 94087
+ escholarship.org | 80632
+ ir.opt.ac.cn | 70504
+ idus.us.es | 67908
+ www.cambridge.org | 56376
+ www.ssoar.info | 52534
+ rep.bntu.by | 52127
+ scholarworks.umt.edu | 48546
+ publikationen.ub.uni-frankfurt.de | 46987
+ dk.um.si | 45753
+ repositorio.uladech.edu.pe | 37028
+ uu.diva-portal.org | 34929
+ digitalcommons.law.byu.edu | 31732
+ sedici.unlp.edu.ar | 31233
+ elib.sfu-kras.ru | 29131
+ jyx.jyu.fi | 28144
+ www.repository.cam.ac.uk | 27728
+ nagoya.repo.nii.ac.jp | 26673
+ www.duo.uio.no | 25258
+ www.persee.fr | 24968
+ www2.senado.leg.br | 24426
+ tesis.ucsm.edu.pe | 24049
+ digitalcommons.unl.edu | 21974
+ www.degruyter.com | 21940
+ www.igi-global.com | 20736
+ thekeep.eiu.edu | 20712
+ docs.lib.purdue.edu | 20538
+ repositorio.cepal.org | 20280
+ elib.bsu.by | 19620
+ minds.wisconsin.edu | 19473
+ (30 rows)
+
+These all seem worth crawling. A couple publishers (cambridge.org), and
+persee.fr will probably fail, but not too many URLs.
+
+## Summary of Filtered Prefixes and Domains (OAI-PMH)
+
+oai:kb.dk:
+ too large and generic
+oai:bdr.oai.bsb-muenchen.de:
+ too large and generic
+oai:hispana.mcu.es:
+ too large and generic
+oai:bnf.fr:
+ too large and generic
+oai:ukm.si:
+ too large and generic
+oai:biodiversitylibrary.org:
+ redundant with other ingest and archive.org content
+oai:hsp.org:
+ large; historical content only
+oai:repec:
+ large; mostly (entirely?) links to publisher sites
+oai:n/a:
+ meta?
+oai:quod.lib.umich.edu:
+ entire issues? hard to crawl so skip for now
+oai:hypotheses.org:
+ HTML, not PDF
+oai:americanae.aecid.es:
+ large, complex. skip for now
+oai:www.irgrid.ac.cn:
+ aggregator of other IRs
+oai:espace.library.uq.edu.au:
+ large; metadata only; javascript heavy (poor heritrix crawling)
+oai:edoc.mpg.de:
+ deprecated domain, with no redirects
+oai:bibliotecadigital.jcyl.es:
+ digitized historical docs; hard to crawl, skip for now
+oai:repository.erciyes.edu.tr:
+ gone (domain lapsed)
+oai:krm.or.kr:
+ "research results repository" (metadata only)
+
+www.kb.dk
+ large, general purpose, scope
+kb-images.kb.dk
+ deprecated
+mdz-nbn-resolving.de
+ multiple prefixes end up here. historical docs, scope
+aggr.ukm.um.si
+ large, out of scope
+edoc.mpg.de
+ deprecated domain
+doaj.org
+ index (metadata only)
+orcid.org
+ out of scope
+gateway.isiknowledge.com
+ clarivate login/paywall (skipping in ingest)
+
+Needs filtering to a subset of records (by 'set' or other filtering?):
+
+oai:igi.indrastra.com:
+oai:invenio.nusl.cz:
+oai:t2r2.star.titech.ac.jp:
+oai:evastar-karlsruhe.de:
+oai:repository.ust.hk:
+oai:serval.unil.ch:
+oai:pure.atira.dk:
+
+Filters in SQL syntax:
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+
+and in some contexts (PDFs; switch to HTML):
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+## Overall Summary of OAI-PMH Stuff
+
+Big picture is that the majority of `no-pdf-link` crawl status are because of
+repository scope, record scope, or content format issues. That being said,
+there was a sizable fraction of sites which were platforms (like DSpace) which
+were not ingesting well.
+
+A significant fraction of records are "metadata only" (of papers), or non-paper
+entity types (like persons, grants, or journal titles), and a growing fraction
+(?) are metadata plus link to OA publisher fulltext (offsite). Might be
+possible to detect these at ingest time, or earlier at OAI-PMH
+harvest/transform time and filter them out.
+
+It may be worthwhile to attempt ingest of multiple existing captures
+(timestamps) in the ingest pipeline. Eg, instead of choosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
+(or at least a couple). This is because repository software gets upgraded, so
+old "no-capture" or "not found" or "link loop" type captures may work when
+recrawled.
+
+New summary with additional filters:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 12872279
+ no-pdf-link | 9329602
+ no-capture | 4696362
+ redirect-loop | 1541458
+ terminal-bad-status | 660418
+ link-loop | 452831
+ wrong-mimetype | 434868
+ null-body | 71065
+ cdx-error | 17005
+ | 15275
+ petabox-error | 12743
+ wayback-error | 11759
+ skip-url-blocklist | 182
+ gateway-timeout | 122
+ redirects-exceeded | 120
+ bad-redirect | 117
+ bad-gzip-encoding | 111
+ wayback-content-error | 102
+ timeout | 72
+ blocked-cookie | 62
+ (20 rows)
+
diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md
new file mode 100644
index 0000000..a0bb0c5
--- /dev/null
+++ b/notes/ingest/2021-09-03_daily_improvements.md
@@ -0,0 +1,1021 @@
+
+Periodic check-in of daily crawling/ingest.
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------+--------
+ pdf | no-pdf-link | 158474
+ pdf | spn2-cdx-lookup-failure | 135344
+ pdf | success | 127938
+ pdf | spn2-error | 65411
+ pdf | gateway-timeout | 63112
+ pdf | blocked-cookie | 26338
+ pdf | terminal-bad-status | 24853
+ pdf | link-loop | 15699
+ pdf | spn2-error:job-failed | 13862
+ pdf | redirect-loop | 11432
+ pdf | cdx-error | 2376
+ pdf | too-many-redirects | 2186
+ pdf | wrong-mimetype | 2142
+ pdf | forbidden | 1758
+ pdf | spn2-error:no-status | 972
+ pdf | not-found | 820
+ pdf | bad-redirect | 536
+ pdf | read-timeout | 392
+ pdf | wayback-error | 251
+ pdf | remote-server-error | 220
+ (20 rows)
+
+Hrm, that is a healthy fraction of `no-pdf-link`.
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 39678
+ osf.io | gateway-timeout | 29809
+ acervus.unicamp.br | no-pdf-link | 21978
+ osf.io | terminal-bad-status | 18727
+ zenodo.org | spn2-cdx-lookup-failure | 17008
+ doi.org | spn2-cdx-lookup-failure | 15503
+ www.degruyter.com | no-pdf-link | 15122
+ ieeexplore.ieee.org | spn2-error:job-failed | 12921
+ osf.io | spn2-cdx-lookup-failure | 11123
+ www.tandfonline.com | blocked-cookie | 8096
+ www.morressier.com | no-pdf-link | 4655
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580
+ pubs.acs.org | blocked-cookie | 4415
+ www.frontiersin.org | no-pdf-link | 4163
+ www.degruyter.com | spn2-cdx-lookup-failure | 3788
+ www.taylorfrancis.com | no-pdf-link | 3568
+ www.sciencedirect.com | no-pdf-link | 3128
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116
+ acervus.unicamp.br | spn2-cdx-lookup-failure | 2797
+ www.mdpi.com | spn2-cdx-lookup-failure | 2719
+ brill.com | link-loop | 2681
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 2546
+ apps.crossref.org | no-pdf-link | 2537
+ onlinelibrary.wiley.com | blocked-cookie | 2528
+ (25 rows)
+
+Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status != 'spn2-cdx-lookup-failure'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+
+ domain | status | count
+ -----------------------------------------------------------------+-----------------------+--------
+ academic.oup.com | | 2405
+ academic.oup.com | no-pdf-link | 1240
+ academic.oup.com | link-loop | 1010
+ acervus.unicamp.br | | 21980
+ acervus.unicamp.br | no-pdf-link | 21978 **
+ aclanthology.org | | 208
+ acp.copernicus.org | | 365
+ acp.copernicus.org | success | 356
+ aip.scitation.org | | 1071
+ aip.scitation.org | blocked-cookie | 843
+ aip.scitation.org | redirect-loop | 227
+ apps.crossref.org | | 2537
+ apps.crossref.org | no-pdf-link | 2537
+ arxiv.org | | 17817
+ arxiv.org | success | 17370
+ arxiv.org | terminal-bad-status | 320
+ asmedigitalcollection.asme.org | | 401
+ asmedigitalcollection.asme.org | link-loop | 364
+ assets.researchsquare.com | | 3706
+ assets.researchsquare.com | success | 3706
+ avmj.journals.ekb.eg | | 605
+ avmj.journals.ekb.eg | success | 595
+ bfa.journals.ekb.eg | | 224
+ bfa.journals.ekb.eg | success | 214
+ biorxiv.org | redirect-loop | 895
+ biorxiv.org | | 895
+ birdsoftheworld.org | | 286
+ birdsoftheworld.org | no-pdf-link | 285
+ bmjopen.bmj.com | success | 232
+ bmjopen.bmj.com | | 232
+ books.openedition.org | | 396
+ books.openedition.org | no-pdf-link | 396
+ brill.com | | 4272
+ brill.com | link-loop | 2681
+ brill.com | no-pdf-link | 1410
+ cas.columbia.edu | | 1038
+ cas.columbia.edu | no-pdf-link | 1038 **
+ cdr.lib.unc.edu | | 513
+ cdr.lib.unc.edu | success | 469
+ chemrxiv.org | | 278
+ chemrxiv.org | success | 275
+ classiques-garnier.com | | 531
+ classiques-garnier.com | no-pdf-link | 487 *
+ content.iospress.com | | 275
+ content.iospress.com | link-loop | 230
+ cris.maastrichtuniversity.nl | | 318
+ cris.maastrichtuniversity.nl | success | 284
+ cyberleninka.ru | | 1165
+ cyberleninka.ru | success | 1134
+ deepblue.lib.umich.edu | | 289
+ dergipark.org.tr | | 1185
+ dergipark.org.tr | success | 774
+ dergipark.org.tr | no-pdf-link | 320
+ didaktorika.gr | | 688
+ didaktorika.gr | redirect-loop | 688
+ digi.ub.uni-heidelberg.de | | 292
+ digi.ub.uni-heidelberg.de | no-pdf-link | 292
+ direct.mit.edu | | 236
+ direct.mit.edu | no-pdf-link | 207 *
+ dl.acm.org | | 2319
+ dl.acm.org | blocked-cookie | 2230
+ dmtcs.episciences.org | | 733
+ dmtcs.episciences.org | success | 730
+ doi.ala.org.au | no-pdf-link | 2373 **
+ doi.ala.org.au | | 2373
+ doi.org | | 732
+ doi.org | terminal-bad-status | 673
+ downloads.hindawi.com | success | 1452
+ downloads.hindawi.com | | 1452
+ drive.google.com | | 216
+ drive.google.com | no-pdf-link | 211
+ dtb.bmj.com | | 674
+ dtb.bmj.com | link-loop | 669
+ easy.dans.knaw.nl | no-pdf-link | 261 *
+ easy.dans.knaw.nl | | 261
+ ebooks.marilia.unesp.br | | 688
+ ebooks.marilia.unesp.br | no-pdf-link | 688 *
+ ehp.niehs.nih.gov | | 766
+ ehp.niehs.nih.gov | blocked-cookie | 765
+ ejournal.mandalanursa.org | | 307
+ ejournal.mandalanursa.org | success | 305
+ elib.spbstu.ru | | 264
+ elib.spbstu.ru | redirect-loop | 257
+ elibrary.ru | | 1367
+ elibrary.ru | redirect-loop | 1169
+ elibrary.vdi-verlag.de | | 1251
+ elibrary.vdi-verlag.de | no-pdf-link | 646
+ elibrary.vdi-verlag.de | link-loop | 537
+ elifesciences.org | | 328
+ elifesciences.org | success | 323
+ figshare.com | | 803
+ figshare.com | no-pdf-link | 714 *
+ files.osf.io | | 745
+ files.osf.io | success | 614
+ hammer.purdue.edu | | 244
+ hammer.purdue.edu | no-pdf-link | 243
+ heiup.uni-heidelberg.de | | 277
+ heiup.uni-heidelberg.de | no-pdf-link | 268
+ hkvalidate.perfdrive.com | no-pdf-link | 370 *
+ hkvalidate.perfdrive.com | | 370
+ ieeexplore.ieee.org | | 16675
+ ieeexplore.ieee.org | spn2-error:job-failed | 12927
+ ieeexplore.ieee.org | success | 1952
+ ieeexplore.ieee.org | too-many-redirects | 1193
+ ieeexplore.ieee.org | no-pdf-link | 419
+ jamanetwork.com | | 339
+ jamanetwork.com | success | 216
+ jmstt.ntou.edu.tw | | 244
+ jmstt.ntou.edu.tw | success | 241
+ journal.ipb.ac.id | | 229
+ journal.ipb.ac.id | success | 206
+ journal.nafe.org | | 221
+ journals.aps.org | | 614
+ journals.aps.org | gateway-timeout | 495
+ journals.asm.org | | 463
+ journals.asm.org | blocked-cookie | 435
+ journals.flvc.org | | 230
+ journals.lww.com | | 1300
+ journals.lww.com | link-loop | 1284
+ journals.openedition.org | | 543
+ journals.openedition.org | success | 311
+ journals.ub.uni-heidelberg.de | | 357
+ journals.ub.uni-heidelberg.de | success | 311
+ jov.arvojournals.org | | 431
+ jov.arvojournals.org | no-pdf-link | 422 *
+ kiss.kstudy.com | | 303
+ kiss.kstudy.com | no-pdf-link | 303 *
+ library.iated.org | | 364
+ library.iated.org | redirect-loop | 264
+ library.seg.org | blocked-cookie | 301
+ library.seg.org | | 301
+ link.aps.org | redirect-loop | 442
+ link.aps.org | | 442
+ linkinghub.elsevier.com | | 515
+ linkinghub.elsevier.com | gateway-timeout | 392
+ mc.sbm.org.br | | 224
+ mc.sbm.org.br | success | 224
+ mdpi-res.com | | 742
+ mdpi-res.com | success | 742
+ mdsoar.org | | 220
+ mediarep.org | | 269
+ mediarep.org | success | 264
+ medrxiv.org | redirect-loop | 290
+ medrxiv.org | | 290
+ muse.jhu.edu | | 429
+ muse.jhu.edu | terminal-bad-status | 391
+ mvmj.journals.ekb.eg | | 306
+ oapub.org | | 292
+ oapub.org | success | 289
+ onepetro.org | | 426
+ onepetro.org | link-loop | 406
+ onlinelibrary.wiley.com | | 2835
+ onlinelibrary.wiley.com | blocked-cookie | 2531
+ onlinelibrary.wiley.com | redirect-loop | 264
+ open.library.ubc.ca | | 569
+ open.library.ubc.ca | no-pdf-link | 425 *
+ opendata.uni-halle.de | | 407
+ opendata.uni-halle.de | success | 263
+ osf.io | | 49022
+ osf.io | gateway-timeout | 29810
+ osf.io | terminal-bad-status | 18731
+ osf.io | spn2-error | 247
+ osf.io | not-found | 205
+ oxford.universitypressscholarship.com | | 392
+ oxford.universitypressscholarship.com | link-loop | 233
+ panor.ru | no-pdf-link | 433 *
+ panor.ru | | 433
+ papers.ssrn.com | | 1630
+ papers.ssrn.com | link-loop | 1598
+ pdf.sciencedirectassets.com | | 3063
+ pdf.sciencedirectassets.com | success | 3063
+ peerj.com | | 464
+ peerj.com | no-pdf-link | 303 *
+ periodicos.ufpe.br | | 245
+ periodicos.ufpe.br | success | 232
+ periodicos.unb.br | | 230
+ periodicos.unb.br | success | 221
+ preprints.jmir.org | | 548
+ preprints.jmir.org | cdx-error | 499
+ publications.rwth-aachen.de | | 213
+ publikationen.bibliothek.kit.edu | | 346
+ publikationen.bibliothek.kit.edu | success | 314
+ publikationen.uni-tuebingen.de | | 623
+ publikationen.uni-tuebingen.de | no-pdf-link | 522 *
+ publons.com | no-pdf-link | 934 *
+ publons.com | | 934
+ pubs.acs.org | | 4507
+ pubs.acs.org | blocked-cookie | 4406
+ pubs.rsc.org | | 1638
+ pubs.rsc.org | link-loop | 1054
+ pubs.rsc.org | redirect-loop | 343
+ pubs.rsc.org | success | 201
+ repositorio.ufu.br | | 637
+ repositorio.ufu.br | success | 607
+ repository.dri.ie | | 1852
+ repository.dri.ie | no-pdf-link | 1852 **
+ repository.library.brown.edu | | 293
+ repository.library.brown.edu | no-pdf-link | 291 *
+ res.mdpi.com | | 10367
+ res.mdpi.com | success | 10360
+ retrovirology.biomedcentral.com | | 230
+ revistas.ufrj.br | | 284
+ revistas.ufrj.br | success | 283
+ revistas.uptc.edu.co | | 385
+ revistas.uptc.edu.co | success | 344
+ royalsocietypublishing.org | | 231
+ rsdjournal.org | | 347
+ rsdjournal.org | success | 343
+ s3-ap-southeast-2.amazonaws.com | | 400
+ s3-ap-southeast-2.amazonaws.com | success | 392
+ s3-eu-west-1.amazonaws.com | | 2096
+ s3-eu-west-1.amazonaws.com | success | 2091
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286
+ s3.ca-central-1.amazonaws.com | | 202
+ sage.figshare.com | | 242
+ sage.figshare.com | no-pdf-link | 241
+ sajeb.org | | 246
+ sajeb.org | no-pdf-link | 243
+ scholar.dkyobobook.co.kr | | 332
+ scholar.dkyobobook.co.kr | no-pdf-link | 328 *
+ search.mandumah.com | | 735
+ search.mandumah.com | redirect-loop | 726
+ secure.jbs.elsevierhealth.com | | 1112
+ secure.jbs.elsevierhealth.com | blocked-cookie | 1108
+ stm.bookpi.org | no-pdf-link | 468 *
+ stm.bookpi.org | | 468
+ storage.googleapis.com | | 1012
+ storage.googleapis.com | success | 1012
+ tandf.figshare.com | | 469
+ tandf.figshare.com | no-pdf-link | 466
+ teses.usp.br | | 739
+ teses.usp.br | success | 730
+ tidsskrift.dk | | 360
+ tidsskrift.dk | success | 346
+ tiedejaedistys.journal.fi | | 224
+ tind-customer-agecon.s3.amazonaws.com | success | 332
+ tind-customer-agecon.s3.amazonaws.com | | 332
+ valep.vc.univie.ac.at | no-pdf-link | 280
+ valep.vc.univie.ac.at | | 280
+ watermark.silverchair.com | | 1729
+ watermark.silverchair.com | success | 1719
+ www.academia.edu | | 387
+ www.academia.edu | no-pdf-link | 386
+ www.ahajournals.org | | 430
+ www.ahajournals.org | blocked-cookie | 413
+ www.atenaeditora.com.br | | 572
+ www.atenaeditora.com.br | terminal-bad-status | 513
+ www.atlantis-press.com | success | 722
+ www.atlantis-press.com | | 722
+ www.aup-online.com | | 419
+ www.aup-online.com | no-pdf-link | 419 *
+ www.beck-elibrary.de | | 269
+ www.beck-elibrary.de | no-pdf-link | 268 *
+ www.biodiversitylibrary.org | no-pdf-link | 528 *
+ www.biodiversitylibrary.org | | 528
+ www.bloomsburycollections.com | | 623
+ www.bloomsburycollections.com | no-pdf-link | 605 *
+ www.cabi.org | | 2191
+ www.cabi.org | no-pdf-link | 2186 *
+ www.cairn.info | | 1283
+ www.cairn.info | no-pdf-link | 713
+ www.cairn.info | link-loop | 345
+ www.cambridge.org | | 4128
+ www.cambridge.org | no-pdf-link | 1531
+ www.cambridge.org | success | 1441
+ www.cambridge.org | link-loop | 971
+ www.cureus.com | no-pdf-link | 526 *
+ www.cureus.com | | 526
+ www.dbpia.co.kr | | 637
+ www.dbpia.co.kr | redirect-loop | 631
+ www.deboni.he.com.br | | 382
+ www.deboni.he.com.br | success | 381
+ www.degruyter.com | | 17783
+ www.degruyter.com | no-pdf-link | 15102
+ www.degruyter.com | success | 2584
+ www.dovepress.com | | 480
+ www.dovepress.com | success | 472
+ www.e-manuscripta.ch | | 1350
+ www.e-manuscripta.ch | no-pdf-link | 1350 *
+ www.e-periodica.ch | | 1276
+ www.e-periodica.ch | no-pdf-link | 1275
+ www.e-rara.ch | | 202
+ www.e-rara.ch | no-pdf-link | 202
+ www.elgaronline.com | | 495
+ www.elgaronline.com | link-loop | 290
+ www.elibrary.ru | | 922
+ www.elibrary.ru | no-pdf-link | 904
+ www.emerald.com | | 2155
+ www.emerald.com | no-pdf-link | 1936 *
+ www.emerald.com | success | 219
+ www.eurekaselect.com | | 518
+ www.eurekaselect.com | no-pdf-link | 516 *
+ www.frontiersin.org | | 4163
+ www.frontiersin.org | no-pdf-link | 4162 **
+ www.hanser-elibrary.com | | 444
+ www.hanser-elibrary.com | blocked-cookie | 444
+ www.hanspub.org | | 334
+ www.hanspub.org | no-pdf-link | 314
+ www.idunn.no | | 1736
+ www.idunn.no | link-loop | 596
+ www.idunn.no | success | 577
+ www.idunn.no | no-pdf-link | 539
+ www.igi-global.com | terminal-bad-status | 458
+ www.igi-global.com | | 458
+ www.ijcai.org | | 533
+ www.ijcai.org | success | 532
+ www.ijraset.com | success | 385
+ www.ijraset.com | | 385
+ www.inderscience.com | | 712
+ www.inderscience.com | no-pdf-link | 605 *
+ www.ingentaconnect.com | | 456
+ www.ingentaconnect.com | no-pdf-link | 413 *
+ www.internationaljournalssrg.org | | 305
+ www.internationaljournalssrg.org | no-pdf-link | 305 *
+ www.isca-speech.org | | 2392
+ www.isca-speech.org | no-pdf-link | 2391 **
+ www.journals.uchicago.edu | | 228
+ www.journals.uchicago.edu | blocked-cookie | 227
+ www.jstage.jst.go.jp | | 1492
+ www.jstage.jst.go.jp | success | 1185
+ www.jstage.jst.go.jp | no-pdf-link | 289
+ www.jstor.org | | 301
+ www.jurology.com | | 887
+ www.jurology.com | redirect-loop | 887
+ www.karger.com | | 318
+ www.liebertpub.com | | 507
+ www.liebertpub.com | blocked-cookie | 496
+ www.morressier.com | | 4781
+ www.morressier.com | no-pdf-link | 4655 **
+ www.ncl.ecu.edu | | 413
+ www.ncl.ecu.edu | success | 413
+ www.nomos-elibrary.de | | 526
+ www.nomos-elibrary.de | no-pdf-link | 391
+ www.oecd-ilibrary.org | no-pdf-link | 1170 **
+ www.oecd-ilibrary.org | | 1170
+ www.openagrar.de | no-pdf-link | 221
+ www.openagrar.de | | 221
+ www.osapublishing.org | | 900
+ www.osapublishing.org | link-loop | 615
+ www.osapublishing.org | no-pdf-link | 269
+ www.osti.gov | | 630
+ www.osti.gov | link-loop | 573
+ www.oxfordlawtrove.com | no-pdf-link | 476 *
+ www.oxfordlawtrove.com | | 476
+ www.pdcnet.org | | 298
+ www.pdcnet.org | terminal-bad-status | 262
+ www.pedocs.de | | 203
+ www.pnas.org | | 222
+ www.preprints.org | | 372
+ www.preprints.org | success | 366
+ www.repository.cam.ac.uk | | 801
+ www.repository.cam.ac.uk | success | 359
+ www.repository.cam.ac.uk | no-pdf-link | 239
+ www.research-collection.ethz.ch | | 276
+ www.research-collection.ethz.ch | terminal-bad-status | 274
+ www.revistas.usp.br | | 207
+ www.revistas.usp.br | success | 204
+ www.rina.org.uk | no-pdf-link | 1009 **
+ www.rina.org.uk | | 1009
+ www.schweizerbart.de | no-pdf-link | 202
+ www.schweizerbart.de | | 202
+ www.scielo.br | | 544
+ www.scielo.br | redirect-loop | 526
+ www.sciencedirect.com | | 3901
+ www.sciencedirect.com | no-pdf-link | 3127 **
+ www.sciencedirect.com | link-loop | 701
+ www.sciendo.com | | 384
+ www.sciendo.com | success | 363
+ www.sciengine.com | | 225
+ www.scirp.org | | 209
+ www.spandidos-publications.com | | 205
+ www.tandfonline.com | | 8925
+ www.tandfonline.com | blocked-cookie | 8099
+ www.tandfonline.com | terminal-bad-status | 477
+ www.tandfonline.com | redirect-loop | 322
+ www.taylorfrancis.com | | 6119
+ www.taylorfrancis.com | no-pdf-link | 3567
+ www.taylorfrancis.com | link-loop | 2169
+ www.taylorfrancis.com | terminal-bad-status | 353
+ www.thieme-connect.de | | 1047
+ www.thieme-connect.de | redirect-loop | 472
+ www.thieme-connect.de | spn2-error:job-failed | 343
+ www.tib.eu | | 206
+ www.trp.org.in | | 311
+ www.trp.org.in | success | 311
+ www.un-ilibrary.org | no-pdf-link | 597 *
+ www.un-ilibrary.org | | 597
+ www.vr-elibrary.de | | 775
+ www.vr-elibrary.de | blocked-cookie | 774
+ www.wjgnet.com | | 204
+ www.wjgnet.com | no-pdf-link | 204
+ www.worldscientific.com | | 974
+ www.worldscientific.com | blocked-cookie | 971
+ www.worldwidejournals.com | | 242
+ www.worldwidejournals.com | no-pdf-link | 203
+ www.wto-ilibrary.org | no-pdf-link | 295
+ www.wto-ilibrary.org | | 295
+ www.zora.uzh.ch | | 222
+ zenodo.org | | 49460
+ zenodo.org | no-pdf-link | 39721
+ zenodo.org | success | 8954
+ zenodo.org | wrong-mimetype | 562
+ | | 445919
+ | no-pdf-link | 168035
+ | success | 140875
+ | gateway-timeout | 31809
+ | blocked-cookie | 26431
+ | terminal-bad-status | 25625
+ | link-loop | 19006
+ | spn2-error:job-failed | 13962
+ | redirect-loop | 12512
+ | wrong-mimetype | 2302
+ | spn2-error | 1689
+ | too-many-redirects | 1203
+ | bad-redirect | 732
+ | cdx-error | 539
+ | not-found | 420
+ | spn2-error:no-status | 256
+ (419 rows)
+
+Get random subsets by terminal domain:
+
+ \x auto
+ SELECT
+ ingest_request.link_source_id AS link_source_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%'
+ ORDER BY random()
+ LIMIT 5;
+
+## acervus.unicamp.br
+
+Previously flagged as messy (2021-05_daily_improvements.md)
+
+## cas.columbia.edu
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-2ety-qm51
+base_url | https://doi.org/10.7916/d8-2ety-qm51
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-0zf6-d167
+base_url | https://doi.org/10.7916/d8-0zf6-d167
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-k6ha-sn43
+base_url | https://doi.org/10.7916/d8-k6ha-sn43
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-bj6t-eb07
+base_url | https://doi.org/10.7916/d8-bj6t-eb07
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-xjac-j502
+base_url | https://doi.org/10.7916/d8-xjac-j502
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+
+these are not public (loginwalls)
+
+DONE: '/login?TARGET=' as a login wall pattern
+
+## doi.ala.org.au
+
+Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md)
+
+NOTE: look at ingesting datasets
+
+## www.isca-speech.org
+
+-[ RECORD 1 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2014-84
+base_url | https://doi.org/10.21437/interspeech.2014-84
+terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html
+-[ RECORD 2 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2004-319
+base_url | https://doi.org/10.21437/interspeech.2004-319
+terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html
+-[ RECORD 3 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-372
+base_url | https://doi.org/10.21437/interspeech.2006-372
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html
+-[ RECORD 4 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2015-588
+base_url | https://doi.org/10.21437/interspeech.2015-588
+terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html
+-[ RECORD 5 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-468
+base_url | https://doi.org/10.21437/interspeech.2006-468
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html
+
+Bespoke site. Added rule to sandcrawler.
+
+NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?)
+
+## www.morressier.com
+
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0002858v
+base_url | https://doi.org/10.1115/1.0002858v
+terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0003896v
+base_url | https://doi.org/10.1115/1.0003896v
+terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0004476v
+base_url | https://doi.org/10.1115/1.0004476v
+terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0001286v
+base_url | https://doi.org/10.1115/1.0001286v
+terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0000315v
+base_url | https://doi.org/10.1115/1.0000315v
+terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874
+
+Many of these seem to be presentations, as both video and slides. PDFs seem broken though.
+
+NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data)
+
+## www.oecd-ilibrary.org
+
+Paywall (2021-05_daily_improvements.md)
+
+## www.rina.org.uk
+
+-[ RECORD 1 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.ws.2002.10
+base_url | https://doi.org/10.3940/rina.ws.2002.10
+terminal_url | https://www.rina.org.uk/showproducts.html?product=4116
+-[ RECORD 2 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.pass.2003.16
+base_url | https://doi.org/10.3940/rina.pass.2003.16
+terminal_url | https://www.rina.org.uk/showproducts.html?product=3566
+-[ RECORD 3 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin.2013.15
+base_url | https://doi.org/10.3940/rina.icsotin.2013.15
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8017
+-[ RECORD 4 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.wfa.2010.23
+base_url | https://doi.org/10.3940/rina.wfa.2010.23
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8177
+-[ RECORD 5 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin15.2015.01
+base_url | https://doi.org/10.3940/rina.icsotin15.2015.01
+terminal_url | https://www.rina.org.uk/showproducts.html?product=7883
+
+Site is broken in some way
+
+## www.sciencedirect.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.jhlste.2021.100332
+base_url | https://doi.org/10.1016/j.jhlste.2021.100332
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.hazadv.2021.100006
+base_url | https://doi.org/10.1016/j.hazadv.2021.100006
+terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-12-822844-9.00009-8
+base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.colcom.2021.100490
+base_url | https://doi.org/10.1016/j.colcom.2021.100490
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-323-85245-6.00012-6
+base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126
+
+These no-pdf-link ones seem to just be not OA, which is expected for much of the
+domain.
+
+## repository.dri.ie
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+---------------------------------------------
+ 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941
+ 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f
+ 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102
+ 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t
+ 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726
+
+"Digital repository of Ireland"
+
+Historical scanned content. Bespoke site. Fixed.
+
+NOTE: recrawl/retry this domain
+
+## www.frontiersin.org
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/978-2-88971-147-5
+base_url | https://doi.org/10.3389/978-2-88971-147-5
+terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fnins.2021.722592
+base_url | https://doi.org/10.3389/fnins.2021.722592
+terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fcell.2021.683209
+base_url | https://doi.org/10.3389/fcell.2021.683209
+terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fmicb.2021.692474
+base_url | https://doi.org/10.3389/fmicb.2021.692474
+terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fneur.2021.676527
+base_url | https://doi.org/10.3389/fneur.2021.676527
+terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full
+
+All the `/research-topics/` URLs are out of scope.
+
+NOTE: recrawl missing frontiersin.org articles for PDFs
+NOTE: recrawl missing frontiersin.org articles for XML (?)
+
+-------
+
+## direct.mit.edu
+
+Previously "not available" (2021-05_daily_improvements.md)
+
+## figshare.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15052236.v6
+base_url | https://doi.org/10.6084/m9.figshare.15052236.v6
+terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.14907846.v5
+base_url | https://doi.org/10.6084/m9.figshare.14907846.v5
+terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15157614.v1
+base_url | https://doi.org/10.6084/m9.figshare.15157614.v1
+terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15172926.v1
+base_url | https://doi.org/10.6084/m9.figshare.15172926.v1
+terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.16532574.v1
+base_url | https://doi.org/10.6084/m9.figshare.16532574.v1
+terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1
+
+NOTE: can determine the content type (software, book, preprint, media) from the redirect URL path, I guess. This is helpful for ingest!
+Could also potentially correct fatcat release_type using this info.
+
+We seem to be getting the ones we can (eg, papers) just fine
+
+## hkvalidate.perfdrive.com
+
+Should be skipping/bailing on this domain, but we are not for some reason.
+
+-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05cc
+base_url | https://doi.org/10.3847/1538-4357/ac05cc
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 2 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac0429
+base_url | https://doi.org/10.3847/1538-4357/ac0429
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 3 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1149/1945-7111/ac1a85
+base_url | https://doi.org/10.1149/1945-7111/ac1a85
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 4 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.35848/1882-0786/ac1b0d
+base_url | https://doi.org/10.35848/1882-0786/ac1b0d
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 5 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05ba
+base_url | https://doi.org/10.3847/1538-4357/ac05ba
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+
+Was failing to check against blocklist again at the end of attempts.
+
+Could retry all these to update status, but probably not worth it.
+
+## jov.arvojournals.org
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+-------------------------------------------------------------
+ 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021
+ 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561
+ 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057
+ 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793
+ 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441
+
+These seem to just not be published/available yet.
+
+But they also use watermark.silverchair.com
+
+NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest
+NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix
+
+## kiss.kstudy.com
+
+Previously unable to download (2021-05_daily_improvements.md)
+
+## open.library.ubc.ca
+
+ link_source_id | base_url | terminal_url
+--------------------+------------------------------------+----------------------------------------------------------------------------------
+ 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664
+ 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189
+ 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+ 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994
+ 10.14288/1.0401312 | https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312
+
+Historical newspapers, out of scope?
+
+Video content:
+https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+
+Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+
+NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+
+
+## panor.ru
+
+ link_source_id | base_url | terminal_url
+-------------------------+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html
+ 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html
+ 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html
+ 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html
+ 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html
+
+"The full version of the article is available only to subscribers of the journal"
+
+Paywall
+
+## peerj.com
+
+Previously: this is HTML of reviews (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope
+
+## publons.com
+
+Previously: this is HTML (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope (length of works)
+
+## stm.bookpi.org
+
+ link_source_id | base_url | terminal_url
+-----------------------------+---------------------------------------------+----------------------------------------------------
+ 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231
+ 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096
+ 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330
+ 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810
+ 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274
+
+These are... just abstracts of articles within a book? Weird. Maybe sketchy? DOIs via Crossref
+
+## www.cabi.org
+
+ link_source_id | base_url | terminal_url
+--------------------------+------------------------------------------+----------------------------------------------------
+ 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742
+ 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471
+ 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544
+ 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117
+ 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337
+
+Reviews? but just abstracts?
+
+## www.cureus.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17547
+base_url | https://doi.org/10.7759/cureus.17547
+terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16867
+base_url | https://doi.org/10.7759/cureus.16867
+terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17425
+base_url | https://doi.org/10.7759/cureus.17425
+terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17313
+base_url | https://doi.org/10.7759/cureus.17313
+terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16943
+base_url | https://doi.org/10.7759/cureus.16943
+terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed
+
+Ugh, stupid "email to get PDF" gate, but ingest seems to work anyway?
+
+NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar)
+
+## www.e-manuscripta.ch
+
+ link_source_id | base_url | terminal_url
+------------------------------+----------------------------------------------+-------------------------------------------------------------------
+ 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031
+ 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064
+ 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176
+ 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200
+ 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008
+
+Historical docs, single pages, but do have full PDF downloads.
+
+NOTE: re-ingest
+
+## www.inderscience.com
+
+Previously: paywall (2021-05_daily_improvements.md)
+
+## www.un-ilibrary.org
+
+ link_source_id | base_url | terminal_url
+----------------------------+--------------------------------------------+-------------------------------------------------------------
+ 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307
+ 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011
+ 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014
+ 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020
+ 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005
+
+Books and chapters. Doesn't seem to have actual download ability?
+
+# Re-Ingest / Re-Crawl
+
+Using fatcat-ingest helper tool.
+
+- www.isca-speech.org doi_prefix:10.21437
+ doi:* doi_prefix:10.21437 in_ia:false
+ 9,233
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json
+ => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221})
+- repository.dri.ie doi_prefix:10.7486
+ doi:* in_ia:false doi_prefix:10.7486
+ 56,532
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json
+ => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532})
+- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link)
+ 25,598
+ many are meeting abstracts
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json
+ => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598})
+- www.cureus.com doi_prefix:10.7759
+ 1,537
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json
+ => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535})
+- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta
+ 110,945
+ TODO: all are marked 'unpublished', but that is actually probably right?
+- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!)
+ doi:* in_ia:false doi_prefix:10.3389
+ 212,370
+ doi:10.3389/conf.* => most seem to be just abstracts? how many like this?
+ container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k)
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 191k
+ but many might be components? this is actually kind of a mess
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 19.2k
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json
+
+# Remaining Tasks / Domains (TODO)
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md
new file mode 100644
index 0000000..8cdd6df
--- /dev/null
+++ b/notes/ingest/NEXT.md
@@ -0,0 +1,52 @@
+
+biorxiv
+medrxiv
+ doi:10.1101\/20*
+
+persee.fr 147k
+ publisher:persee in_ia:false is_oa:true
+ https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013
+
+cairn.info: 161k
+ doi_prefix:10.3917 in_ia:false is_oa:true
+ https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm
+ https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm
+
+IOP OA: 169k
+ doi_prefix:10.1088 is_oa:true in_ia:false
+
+indian journals platform? 124k
+ doi_prefix:10.4103 in_ia:false is_oa:true
+ http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad
+ http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki
+
+openedition? 48k
+ doi_prefix:10.4000 is_oa:true in_ia:false
+
+german medical science (GMS) 28k
+ doi_prefix:10.3205 in_ia:false is_oa:true
+ https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml
+
+serbian chemistry 28k
+ doi_prefix:10.2298 in_ia:false is_oa:true
+ http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H
+
+jalc oa doi: 82k
+ doi_registrar:jalc in_ia:false is_oa:true
+
+sage OA papers
+ https://journals.sagepub.com/doi/10.1177/034003529802400510
+
+Scientific Reports: 25k
+ in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm"
+
+U Toronto press: 23k
+ publisher:"Toronto Press" in_ia:false is_oa:true
+ has an annoying bounce page
+
+ASHA (speech-language-hearing association): 7k
+ publisher:Speech-Language-Hearing in_ia:false is_oa:true
+
+MIT press journals
+
+
diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py
new file mode 100755
index 0000000..4cd1811
--- /dev/null
+++ b/notes/ingest/es_csv_to_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+"""
+ input like:
+
+ doi,ident,"release_stage"
+ "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
+ "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
+ "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
+ "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
+ "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published
+
+ output like:
+
+ {
+ "base_url": "https://doi.org/10.7554/elife.38904",
+ "ext_ids": {
+ "doi": "10.7554/elife.38904"
+ },
+ "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
+ "release_stage": "published"
+ }
+"""
+
+import csv, sys, json
+
+reader = csv.DictReader(sys.stdin)  # CSV on stdin; first row is the header and must name the doi, ident and release_stage columns
+for row in reader:
+ d = {  # one output object per CSV row, shaped like the "output like" example in the module docstring
+ "base_url": "https://doi.org/{}".format(row['doi']),  # DOI is inserted as-is (no URL-escaping)
+ "ext_ids": {
+ "doi": row['doi'],
+ },
+ "fatcat_release": row['ident'],
+ "release_stage": row['release_stage'],
+ }
+ print(json.dumps(d))  # one JSON object per line on stdout
diff --git a/notes/library_shopping.txt b/notes/library_shopping.txt
new file mode 100644
index 0000000..bf876a5
--- /dev/null
+++ b/notes/library_shopping.txt
@@ -0,0 +1,10 @@
+
+potential helpers:
+- https://github.com/martinblech/xmltodict
+- https://github.com/trananhkma/fucking-awesome-python#text-processing
+- https://github.com/blaze/blaze (for catalog/analytics)
+- validation: https://github.com/pyeve/cerberus
+- testing (to replace nose):
+ - https://github.com/CleanCut/green
+ - pytest
+ - mamba ("behavior driven")
diff --git a/notes/match_filter_enrich.txt b/notes/match_filter_enrich.txt
new file mode 100644
index 0000000..0c1f7df
--- /dev/null
+++ b/notes/match_filter_enrich.txt
@@ -0,0 +1,31 @@
+
+This could all be a single scalding job eventually.
+
+First, run matchcrossref and dumpfilemeta, and copy the output down to an SSD
+somewhere.
+
+ bnewbold@ia601101$ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | wc -l
+ 30728100
+
+Reduce down the scored matches to just {sha1, dois}, sorted:
+
+ zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort -S 8G > 2018-08-27-2352.17-matchcrossref.filtered.tsv
+ # 5.79M 0:18:54 [5.11k/s]
+
+Join/merge the output:
+
+ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | LC_ALL=C join -t$'\t' 2018-08-27-2352.17-matchcrossref.filtered.tsv - | pv -l | ./enrich_scored_matches.py | gzip > 2018-08-27-2352.17-matchcrossref.insertable.json.gz
+ # 5.79M 0:09:09 [10.5k/s]
+
+## Fatcat Insertable
+
+I can't remember now what the plan was for the 'insertable' output mode, which
+bundles {key, cdx, mime, and size} info along with the {slug, score, json1,
+json2} columns from the regular match script. The filter_scored_matches.py
+doesn't know what to do with those columns at the moment, and the output isn't
+sorted by slug... need to tweak scripts to fix this.
+
+In the meanwhile, as a work around just take the columns we want and re-sort:
+
+ export LC_ALL=C
+ zcat 2018-12-18-2237.09-matchcrossref.insertable.tsv.gz | cut -f2-5 | sort -S 8G -u | gzip > 2018-12-18-2237.09-matchcrossref.tsv.gz
diff --git a/notes/old_extract_results.txt b/notes/old_extract_results.txt
new file mode 100644
index 0000000..0327b8b
--- /dev/null
+++ b/notes/old_extract_results.txt
@@ -0,0 +1,50 @@
+
+command:
+
+ ./extraction_cdx_grobid.py --hbase-table wbgrp-journal-extract-0-qa --hbase-host bnewbold-dev.us.archive.org --grobid-uri http://wbgrp-svc096.us.archive.org:8070 -r hadoop -c mrjob.conf --archive $VENVSHORT.tar.gz#venv hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx --jobconf mapred.line.input.format.linespermap=8000 --jobconf mapreduce.job.queuename=extraction
+
+Started: Wed Apr 11 05:54:54 UTC 2018
+Finished: Sun Apr 15 20:42:37 UTC 2018
+(late saturday night PST fixed grobid parallelism)
+
+Elapsed: 110hrs, 47mins, 42sec
+
+line counts:
+ error 3896
+ existing 311209
+ invalid 2311343
+ skip 195641
+ success 1143094
+ total 3,965,183
+
+## Against prod table
+
+Started: Sun Apr 15 21:38:24 UTC 2018
+Finished: Wed Apr 18 17:36:44 UTC 2018
+Elapsed: 67hrs, 58mins, 20sec
+
+lines
+ error 143
+ existing 213292
+ invalid 2311343
+ skip 195641
+ success 1,244,764
+ total 3,965,183
+
+## TARGETED
+
+Job job_1513499322977_358533 failed with state FAILED due to: Task failed task_1513499322977_358533_m_000323
+
+Started: Thu Apr 19 05:21:25 UTC 2018
+Finished: Sat Apr 21 11:01:58 UTC 2018
+Elapsed: 53hrs, 40mins, 33sec
+
+lines
+ error=4093
+ existing=55448
+ invalid=688873
+ skip=257533
+ success=1,282,053
+ total=2,288,000
+
+
diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt
new file mode 100644
index 0000000..f46ea61
--- /dev/null
+++ b/notes/petabox_ia_metadata.txt
@@ -0,0 +1,56 @@
+
+Ran in aitio:/schnell/iamine-journals in December 2018.
+
+Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018
+
+Commands:
+
+ # didn't work!
+ #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist
+
+ # fetched manually via metamgr, using prefix matches
+ cat metamgr-* > metamgr-journals-loose.20181218.items
+
+ ia-mine metamgr-journals-loose.20181218.items > journals.20181218.json
+
+ export LC_ALL=C
+ cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv
+
+Size/results:
+
+ bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items
+ 2043877 journals-ia.20181218.json
+ 2044362 metamgr-journals-loose.20181218.items
+
+ # missed about 500; meh
+
+ -rw-rw-r-- 1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json
+
+ bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv
+ 1748645 journals-ia.20181218.pdf-sha1.tsv
+
+## June 2019 Ingest
+
+ bnewbold@ia601101$ pwd
+ /schnell/iamine-journals
+
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json
+
+ cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json
+ cat jstor.json | ./ia_pdf_match.py > jstor.match.json
+ cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json
+ cat pmc.json | ./ia_pdf_match.py > pmc.match.json
+
+ bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json
+ 1076012 arxiv.json
+ 740970 arxiv.match.json
+ 451204 jstor.json
+ 451204 jstor.match.json
+ 77838 paper-doi.json
+ 23736 paper-doi.match.json
+ 209787 pmc.json
+ 189093 pmc.match.json
+
diff --git a/notes/tasks/2020-01-06_heuristic_cdx.txt b/notes/tasks/2020-01-06_heuristic_cdx.txt
new file mode 100644
index 0000000..209fa4f
--- /dev/null
+++ b/notes/tasks/2020-01-06_heuristic_cdx.txt
@@ -0,0 +1,37 @@
+
+Wanted to include a large number of additional CDX lines based on regex
+pattern. These are primarily .edu domains with things that look like user
+accounts *and* .pdf file extensions in the path.
+
+## Commands
+
+aitio:/fast/gwb_pdfs
+
+ pdfs/gwb-pdf-20191005172329-url-heuristics-edu
+ pdfs/gwb-pdf-20191005172329-url-heuristics
+
+
+to filter as url/sha1 uniq:
+
+ cat raw.cdx | sort -u -t' ' -k3,6 -S 4G > uniq.cdx
+
+ cat gwb-pdf-20191005172329-url-heuristics-edu/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ cat gwb-pdf-20191005172329-url-heuristics/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+
+ 7241795 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ 41137888 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+
+ cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
+ 7241795
+
+ cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
+ 41137888
+
+ ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ Worker: Counter({'total': 7239153, 'insert-cdx': 6845283, 'update-cdx': 0})
+ CDX lines pushed: Counter({'total': 7241795, 'pushed': 7239153, 'skip-parse': 2603, 'skip-mimetype': 39})
+
+ ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+ Worker: Counter({'total': 41030360, 'insert-cdx': 22430064, 'update-cdx': 0})
+ CDX lines pushed: Counter({'total': 41137888, 'pushed': 41030360, 'skip-mimetype': 87341, 'skip-parse': 20187})
+
diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md
new file mode 100644
index 0000000..54db92e
--- /dev/null
+++ b/notes/tasks/2020-01-27_cleanup_cdx.md
@@ -0,0 +1,34 @@
+
+Accidentally seem to have backfilled many CDX lines with non-PDF content.
+Should clear these out!
+
+Something like:
+
+ mimetype = 'text/html'
+ row_created < '2019-10-01'
+
+Or maybe instead:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+SQL:
+
+ SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5;
+ SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01';
+ => 24841846
+
+ SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5;
+ SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL;
+ => 24547552
+
+ DELETE FROM cdx
+ WHERE sha1hex IN
+ (SELECT cdx.sha1hex
+ FROM cdx
+ LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
+ WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
+ => DELETE 24553428
+
+Slightly more rows deleted than counted... probably should have had an
+"AND cdx.mimetype = 'text/html'" in the outer DELETE WHERE clause.
diff --git a/notes/tasks/2020-01-27_grobid_backfill.md b/notes/tasks/2020-01-27_grobid_backfill.md
new file mode 100644
index 0000000..d70e203
--- /dev/null
+++ b/notes/tasks/2020-01-27_grobid_backfill.md
@@ -0,0 +1,40 @@
+
+Recently added a bunch of PDFs to sandcrawler-db. Want to GROBID extract the
+~15m which haven't been processed yet. Also want to re-GROBID a batch of
+PDFs-in-zipfiles from archive.org; will probably also want to re-GROBID other
+petabox files soon.
+
+## pre-1923 zipfile re-extraction
+
+Exact commands (in parallel):
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
+ parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1923-scholarly-works/ | \
+ parallel -j16 --progress --joblog extract_tasks_1923.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+## petabox re-extraction
+
+This was run around 2020-02-03. There are a few million remaining PDFs that
+have only partial file metadata (`file_meta`), meaning they were processed with
+an old version of sandcrawler code. Want to get them all covered, maybe even
+DELETE the missing ones, so re-GROBIDing petabox-only files.
+
+There are about 2,887,834 files in petabox, only 46,232 need re-processing (!).
+
+ psql sandcrawler < dump_regrobid_pdf_petabox.sql
+ cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json
+
+This is pretty few... maybe they would even have been caught by the wayback backfill?
+
+Small start:
+
+ head /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+Full batch, 25x parallel:
+
+ cat /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | pv -l | parallel -j25 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
diff --git a/notes/tasks/2020-02-14_pdftrio.md b/notes/tasks/2020-02-14_pdftrio.md
new file mode 100644
index 0000000..e6f8d8e
--- /dev/null
+++ b/notes/tasks/2020-02-14_pdftrio.md
@@ -0,0 +1,162 @@
+
+First end-to-end `pdf_trio` results!
+
+## Source
+
+Will use AIT partner #1830 (U Alberta) CDX as input. These are unique by
+digest, about 100k.
+
+ ArchiveIt-Collection-1830.download.cdx
+
+## Testing/Prep
+
+Versions/setup:
+
+ sandcrawler: f613f69a40fcc9a445f21cadd35d7c36c8061db8
+ => patched to 'auto' mode
+
+ pdf_trio: 03bd3fdc15418462b2b1582e4f967f26ddcb43e2
+
+ pdftrio: 'auto' mode
+
+ uwsgi: 16x processes
+
+ sudo docker run --rm -p 8501:8501 -e TF_XLA_FLAGS=--tf_xla_cpu_global_jit -e KMP_AFFINITY=granularity=fine,compact,1,0 -e KMP_BLOCKTIME=0 -e OMP_NUM_THREADS=24 -e TENSORFLOW_INTER_OP_PARALLELISM=1 -e TENSORFLOW_INTRA_OP_PARALLELISM=24 -v /srv/pdftrio//models/bert_models:/models/bert_model -v /srv/pdftrio//models/pdf_image_classifier_model:/models/image_model -v /srv/pdftrio//config/tfserving_models_docker.config:/models/tfserving_models.config -v /srv/pdftrio/config/tfserving_batch.config:/models/tfserving_batch.config --name pdftrio-tfserving tensorflow/serving --model_config_file=/models/tfserving_models.config --enable_batching=true --batching_parameters_file=/models/tfserving_batch.config
+
+Basic testing:
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx - | jq .
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+ => Running in kafka output mode, publishing to sandcrawler-qa.pdftrio-output
+
+
+On the persist side:
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-qa.pdftrio-output | head | jq .
+ => looks fine
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env qa persist-pdftrio
+ => Consuming from kafka topic sandcrawler-qa.pdftrio-output, group persist-pdftrio
+
+Ah, don't forget, start persist before writing to topic! Or would need to reset
+offsets to start.
+
+Seems to be only a single pdftotext instance running? Very low CPU
+
+ head -n500 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+That is much better! CPU still not pegged, so maybe could do 50x processes? Lots of I/O wait. Blech.
+
+Zero ("0") not getting persisted for any columns (fixed in sandcrawler/db.py)
+
+`models_date` not getting set. Added `PDFTRIO_MODELS_DATE="2020-01-01"` to env. (TODO: ansible)
+
+## Prod Run
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env prod persist-pdftrio
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Worker CPU basically blocked on pdftotext, multiple 100% CPU. Presumably I/O
+wait? Though not totally sure.
+
+htop:
+
+ PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
+ 17951 pdftrio 20 0 51756 12868 5856 R 90.1 0.0 0:06.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 17870 pdftrio 20 0 52004 12964 5684 R 87.4 0.0 0:08.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 13735 root 20 0 10.4G 3815M 4144 S 79.6 7.6 48h02:37 tensorflow_model_server --port=8500 --rest_api_port=850
+ 14522 pdftrio 20 0 2817M 1331M 16896 R 43.1 2.6 0:57.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 18027 pdftrio 20 0 49192 10692 6116 R 39.8 0.0 0:00.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 14518 pdftrio 20 0 2818M 1336M 16836 S 33.3 2.7 0:47.46 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14504 pdftrio 20 0 2731M 1310M 13164 D 32.6 2.6 0:34.81 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14526 pdftrio 20 0 2816M 1333M 16832 R 28.7 2.7 0:57.22 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14500 pdftrio 20 0 2729M 1306M 13160 R 20.9 2.6 0:22.57 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14492 pdftrio 20 0 2729M 1307M 13156 S 17.6 2.6 0:17.91 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14508 pdftrio 20 0 2734M 1312M 14380 D 14.4 2.6 0:38.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14496 pdftrio 20 0 2728M 1300M 13160 S 13.7 2.6 0:18.00 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17314 sandcrawl 20 0 56668 18228 4304 D 13.7 0.0 0:02.31 perl /usr/bin/parallel -j40 -N1 --pipe --round-robin --
+ 14472 pdftrio 20 0 2725M 1283M 13136 S 12.4 2.6 0:05.69 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14513 pdftrio 20 0 2730M 1309M 14300 S 11.1 2.6 0:40.32 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14480 pdftrio 20 0 2725M 1291M 13144 S 10.4 2.6 0:08.77 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14488 pdftrio 20 0 2725M 1294M 13152 S 9.8 2.6 0:08.18 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14468 pdftrio 20 0 2717M 1271M 13088 S 6.5 2.5 0:02.42 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17411 sandcrawl 20 0 556M 53840 14936 S 6.5 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 14530 pdftrio 20 0 2524M 1252M 3492 S 4.6 2.5 0:12.72 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 7311 bnewbold 20 0 27716 5520 3128 R 3.9 0.0 0:41.59 htop
+ 17444 sandcrawl 20 0 552M 50456 14892 S 3.9 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 18042 pdftrio 20 0 46068 6588 5328 R 3.3 0.0 0:00.05 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 18043 pdftrio 20 0 4 4 0 R 2.6 0.0 0:00.04
+ 2203 grobid 20 0 6334M 126M 4188 S 0.7 0.3 3h27:32 /usr/lib/jvm/java-8-openjdk-amd64/bin/java -XX:MaxMetas
+ 17419 sandcrawl 20 0 619M 116M 15248 S 0.7 0.2 0:02.68 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17440 sandcrawl 20 0 578M 76948 15160 S 0.7 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 13848 root 20 0 0 0 0 D 0.7 0.0 0:00.69 kworker/u60:1
+ 17443 sandcrawl 20 0 578M 76500 14912 S 0.7 0.1 0:01.74 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17414 sandcrawl 20 0 580M 77720 15036 S 0.0 0.2 0:01.77 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17432 sandcrawl 20 0 563M 61460 14976 S 0.0 0.1 0:01.59 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17442 sandcrawl 20 0 561M 53096 15240 S 0.0 0.1 0:01.47 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17433 sandcrawl 20 0 559M 57160 15176 S 0.0 0.1 0:01.52 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17431 sandcrawl 20 0 554M 50960 14892 S 0.0 0.1 0:01.37 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17413 sandcrawl 20 0 554M 52376 14920 S 0.0 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+
+dstat:
+
+ ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+ usr sys idl wai hiq siq| read writ| recv send| in out | int csw
+ 32 9 22 37 0 0| 0 37M| 20M 12M| 0 0 | 35k 64k
+ 20 6 24 50 0 0| 0 20M| 30M 5662k| 0 0 | 27k 48k
+ 27 7 24 43 0 0| 0 26M|8712k 6289k| 0 0 | 21k 114k
+ 30 8 23 38 0 0|4096B 61M| 17M 20M| 0 0 | 31k 54k
+ 33 6 17 44 0 0| 0 32M| 14M 6384k| 0 0 | 27k 46k
+ 25 6 24 44 0 0| 0 19M| 18M 13M| 0 0 | 27k 179k
+ 40 6 19 35 0 0|8192B 25M|7855k 6661k| 0 0 | 31k 85k
+ 59 8 12 20 0 0| 0 39M|4177k 33M| 0 0 | 34k 64k
+ 34 4 17 44 0 0| 0 16M|7527k 11M| 0 0 | 22k 45k
+ 44 7 17 32 0 0| 0 30M| 20M 291k| 0 0 | 36k 62k
+
+Create tmpfs:
+
+ sudo mkdir -p /pdftrio-ramdisk
+ #sudo mount -t tmpfs -o size=2g tmpfs /pdftrio-ramdisk
+ sudo mount -t tmpfs -o size=6g tmpfs /pdftrio-ramdisk
+
+add to pdftrio config env and restart:
+
+ TEMP=/run/pdf_trio
+
+Seems to have worked. Pretty much maxed CPU, may need to back-off parallelism. Doing more than 31/sec.
+
+Errors were not getting encoded correctly:
+
+ File "/fast/sandcrawler/python/sandcrawler/persist.py", line 331, in push_batch
+ r['pdf_trio']['key'] = r['key']
+ KeyError: 'pdf_trio'
+
+Fixed in sandcrawler worker, and patched persist to work around this.
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Wow, 30x parallelism waaaay less?
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+What changed? Confused. Load average was like 40.
+
+Via kafka, as much as 69.71/sec! Errors?
+
+Hrm, this whole `auto` thing. I am very skeptical. Should also do a run as `all`, -j20.
+
+ Worker: Counter({'total': 1916, 'pushed': 1916})
+ CDX lines pushed: Counter({'total': 1934, 'pushed': 1916, 'skip-parse': 18})
+
+Hit some bugs, causing failure, but still seem to have processed a good chunk.
+
+Switched to `all`, running a different batch:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1914.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+After flag change, another batch in `all`:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-2566.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md
new file mode 100644
index 0000000..70e2b59
--- /dev/null
+++ b/notes/tasks/2020-07-22_processing_holes.md
@@ -0,0 +1,120 @@
+
+Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`)
+in sandcrawler database.
+
+
+## `pdf_meta` for petabox rows
+
+Ran `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file.
+
+ wc -l dump_unextracted_pdf_petabox.2020-07-22.json
+ 1503086 dump_unextracted_pdf_petabox.2020-07-22.json
+
+Great, 1.5 million, not too many. Start small:
+
+ head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Full batch:
+
+ cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf_petabox.2020-08-19.json
+ 971194 dump_unextracted_pdf_petabox.2020-08-19.json
+
+## `pdf_meta` missing CDX rows
+
+First, the GROBID-ized rows, but only those that have a fatcat file as well.
+
+10,755,365! That is a lot still to process.
+
+ cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf.fatcat.2020-08-19.json
+ 65517 dump_unextracted_pdf.fatcat.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` missing petabox rows
+
+ wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+ 972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+
+Start small:
+
+ head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Full batch:
+
+ cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_ungrobided_pdf_petabox.2020-08-19.json
+ 933 dump_ungrobided_pdf_petabox.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` for missing CDX rows in fatcat
+
+ wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json
+ 1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json
+
+Full batch:
+
+ cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## `GROBID` for bad status
+
+Eg, wayback errors.
+
+TODO
+
+## `pdf_trio` for OA journal crawls
+
+TODO
+
+## `pdf_trio` for "included by heuristic", not in fatcat
+
+TODO
+
+## Live-ingest missing arxiv papers
+
+ ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json
+ => Expecting 1505184 release objects in search queries
+
+ cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22
+
+Repeating this every few days should (?) result in all the backlog of arxiv
+papers getting indexed. Could focus on recent years to start (with query
+filter).
+
+## re-ingest spn2 errors (all time)
+
+Eg:
+
+ spn2-cdx-lookup-failure: 143963
+ spn-error: 101773
+ spn2-error: 16342
+
+TODO
+
+## re-try CDX errors
+
+Eg, for unpaywall only, bulk ingest all `cdx-error`.
+
+TODO
+
+## live ingest unpaywall `no-capture` URLs
+
+After re-trying the CDX errors for unpaywall URLs (see above), count all the
+no-capture URLs, and if reasonable recrawl them all in live mode ("reasonable"
+meaning fewer than 200k or so URLs).
+
+Could also force recrawl (not using CDX lookups) for some publisher platforms
+if that made sense.
+
+TODO
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@
+
+Want to update fatcat file entities with "full" file metadata for those which are missing it.
+
+How many `file_meta` rows *still* don't have metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 62962
+
+First generate list of sha1hex from most recent bulk export which are missing
+at least some metadata (based on missing sha256):
+
+ zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
+ => 18.7M 0:05:46 [53.8k/s]
+
+Then dump the entire sandcrawler `file_meta` table as TSV, with first column
+sha1hex and second column JSON with all the file metadata fields:
+
+ COPY (
+ SELECT sha1hex, row_to_json(file_meta)
+ FROM file_meta
+ WHERE sha256hex IS NOT NULL
+ ORDER BY sha1hex ASC
+ )
+ TO '/grande/snapshots/file_meta_dump.tsv'
+ WITH NULL '';
+
+Join/cut:
+
+ export LC_ALL=C
+ join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
+ => 18.1M 0:03:37 [83.2k/s]
+
+Check counts:
+
+ cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
+ => 18135313
+
+ zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
+ 18103860 application/pdf
+ 29977 application/octet-stream
+ 876 text/html
+ 199 application/postscript
+ 171 application/gzip
+ 84 text/plain
+ 48 application/xml
+ 38 application/vnd.ms-powerpoint
+ 16 application/msword
+ 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
+ 6 image/jpeg
+ 4 message/rfc822
+ 4 application/zip
+ 4 application/vnd.openxmlformats-officedocument.presentationml.presentation
+ 3 text/x-tex
+ 3 application/x-dosexec
+ 2 application/x-tar
+ 2 application/vnd.ms-tnef
+ 1 video/mpeg
+ 1 image/tiff
+ 1 image/svg+xml
+ 1 image/png
+ 1 image/gif
+ 1 audio/x-ape
+ 1 application/vnd.ms-office
+ 1 application/CDFV2-unknown
+
+TODO: fatcat importer
diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md
new file mode 100644
index 0000000..c0bb65e
--- /dev/null
+++ b/notes/tasks/2020-10-21_pdfextract_holes.md
@@ -0,0 +1,74 @@
+
+Realized I had not enabled persisting of PDF extraction results (thumbnail,
+text) in ingest worker when it was added over the summer. So now need to run a
+catch-up. This applied to both "live" and "bulk" ingest.
+
+## `cdx` / `ingest` / `grobid` catch-up
+
+First, re-run extraction for cases where we did an ingest, and grobid ran
+successfully, and we have a CDX row, but no `pdf_meta`:
+
+ -- this is a slow query
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+ WITH NULL '';
+ => 19,676,116
+
+Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably.
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And again, after a couple partitions got hung up:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json'
+ WITH NULL '';
+
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ => 562k 0:00:16 [34.6k/s]
+
+## `petabox` / `grobid` catch-up
+
+These didn't all seem to extract correctly before: after 1.5m rows, there were
+still 900k unprocessed. Trying again.
+
+ COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM grobid
+ LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE petabox.sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json'
+ WITH NULL '';
+
+ cat /grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## `cdx` / `grobid` catch-up
+
+Next will be to process PDFs with GROBID and CDX but no ingest.
+
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..52a3264
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,66 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+ COPY (
+ SELECT
+ base_url as start_url,
+ terminal_url as pdf_url,
+ terminal_dt as pdf_url_timestamp,
+ terminal_sha1hex as pdf_sha1hex
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+ WITH NULL '';
+ => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require a match
+
+ COPY (
+ SELECT
+ cdx.url as pdf_url,
+ cdx.datetime as pdf_url_timestamp,
+ cdx.sha1hex as pdf_sha1hex
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+ WITH NULL '';
+ => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)
diff --git a/notes/url_pattern_heuristic_backfill.txt b/notes/url_pattern_heuristic_backfill.txt
new file mode 100644
index 0000000..8e422f5
--- /dev/null
+++ b/notes/url_pattern_heuristic_backfill.txt
@@ -0,0 +1,104 @@
+
+/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter
+ 21,434,960
+
+/user/bnewbold/pdfs/gwb-pdf-20171227034923-join-msag
+ 13,637,948
+
+/user/bnewbold/pdfs/gwb-pdf-20171227034923-join-unpaywall-20180329
+ 3,393,658
+
+#########
+
+Goal: backfill a bunch of existing content into the HBase table. Bonus for
+being re-runnable in the future.
+
+Source data:
+- GWB PDF CDX list
+- archive.org JSTOR files (?)
+- arxiv.org bulk files (?)
+- large URL lists (MSAG, etc)
+
+Methods:
+- pig filter GWB PDF CDX list based on regexes
+- pig join GWB PDF CDX list to known URL lists (then remove join)
+x iterate URL lists, hitting CDX API and saving response
+
+
+- (.edu, .ac.uk) domain with a tilde in the URL
+
+#http://www.stanford.edu:80/~johntayl/Papers/taylor2.pdf
+#http://met.nps.edu/~mtmontgo/papers/isabel_part2.pdf
+#http://www.pitt.edu:80/~druzdzel/psfiles/ecai06.pdf
+#http://www.comp.hkbu.edu.hk/~ymc/papers/conference/ijcnn03_710.pdf
+
+hk,edu,hkbu,comp)/~ymc/papers/conference/ijcnn03_710.pdf
+edu,stanford,www)/~johntayl/Papers/taylor2.pdf
+edu,nps,met)/~mtmontgo/papers/isabel_part2.pdf
+edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf
+jp,ak,pitt,www)/~druzdzel/psfiles/ecai06.pdf
+co,edu,pitt,www)/~druzdzel/psfiles/ecai06.pdf
+
+NOT: com,corp,edu,,www)/~druzdzel/psfiles/ecai06.pdf
+
+- the words in URL: paper(s), pubs, research, publications, article, proceedings
+
+#http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf
+#http://files.eric.ed.gov/fulltext/EJ798626.pdf
+#http://www.hbs.edu/research/pdf/10-108.pdf
+#http://www.unifr.ch/biochem/assets/files/albrecht/publications/Abraham06.pdf
+#http://www.cnbc.cmu.edu/cns/papers/Kassetal2005.pdf
+#http://www.macrothink.org/journal/index.php/ijhrs/article/download/5765/4663
+#http://www.pims.math.ca:80/science/2004/fpsac/Papers/Liskovets.pdf
+#http://www.risc.uni-linz.ac.at/publications/download/risc_3287/synasc_revised.pdf
+#http://softsys.cs.uoi.gr/dbglobe/publications/wi04.pdf
+#http://lexikos.journals.ac.za/pub/article/download/1048/564
+#http://www.siam.org/proceedings/analco/2007/anl07_029ecesaratto.pdf
+#http://www.cs.bris.ac.uk/Publications/Papers/2000249.pdf
+
+uk,ac,surrey,ee,personal)/Personal/R.Bowden/publications/2012/Gilbert_ACCV_2012pp.pdf
+gov,ed,eric,files)/fulltext/EJ798626.pdf
+edu,hbs,www)/research/pdf/10-108.pdf
+ch,unifr,www)/biochem/assets/files/albrecht/publications/Abraham06.pdf
+edu,cmu,cnbc,www)/cns/papers/Kassetal2005.pdf
+org,macrothink,www)/journal/index.php/ijhrs/article/download/5765/4663
+ca,math,pims,www)/science/2004/fpsac/Papers/Liskovets.pdf
+at,ac,uni-linz,risc,www)/publications/download/risc_3287/synasc_revised.pdf
+gr,uoi,cs,softsys)/dbglobe/publications/wi04.pdf
+za,ac,journals,lexikos)/pub/article/download/1048/564
+org,siam,www)/proceedings/analco/2007/anl07_029ecesaratto.pdf
+uk,ac,bris,cs,www)/Publications/Papers/2000249.pdf
+
+
+- words in domains: hal., eprint, research., journal
+
+#http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf
+#http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf
+#http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf
+#http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf
+
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf
+org,sgmjournals,ijs)//cgi/reprint/54/6/2217.pdf
+uk,ac,soton,ecs,eprints)/12020/1/mind-the-semantic-gap.pdf
+au,edu,uq,eprint)/archive/00004120/01/R103_Forrester_pp.pdf
+
+- doi-like pattern in URL
+#http://journals.ametsoc.org/doi/pdf/10.1175/2008BAMS2370.1
+#http://www.nejm.org:80/doi/pdf/10.1056/NEJMoa1013607
+
+org,ametsoc,journals)/doi/pdf/10.1175/2008BAMS2370.1
+org,nejm,www)/doi/pdf/10.1056/NEJMoa1013607
+
+- short list of hosts/domains?
+ *.core.ac.uk
+ *scielo*
+ *.redalyc.org
+
+#http://www.scielo.br:80/pdf/cagro/v33n1/v33n1a19.pdf
+#https://revistas.unal.edu.co/index.php/dyna/article/viewFile/51385/57892
+#http://rives.revues.org:80/pdf/449
+
+br,scielo,www)/pdf/cagro/v33n1/v33n1a19.pdf
+co,edu,unal,revistas)/index.php/dyna/article/viewFile/51385/57892
+org,revues,rives)/pdf/449
+
diff --git a/notes/url_pattern_heuristic_verification.txt b/notes/url_pattern_heuristic_verification.txt
new file mode 100644
index 0000000..7b35b88
--- /dev/null
+++ b/notes/url_pattern_heuristic_verification.txt
@@ -0,0 +1,52 @@
+
+## URL pattern regexing
+
+/user/bnewbold/pdfs/gwb-pdf-20171227034923-surt-filter/part*
+
+N https://nsarchive2.gwu.edu//rus/text_files/Volkogonov/1918.10.13%20Speech%20by%20BK,%20to%20Red%20Army%20Soldiers,%20R13977.pdf speech, russian
+
+edu tilde:
+ N http://www.d.umn.edu/~kgilbert/ened3342-1/Field%20Interp%202/snow/CloudIDKey.pdf homework?
+ N http://www.mech.utah.edu/~minor/BIOSKETCH-minor-october%202007.pdf CV
+ N http://web.archive.org/web/20030724175610/http://www.ssc.wisc.edu:80/~sseverin/lect12f01.pdf slides
+ N http://web.archive.org/web/20050117195001/http://www.csie.ntu.edu.tw:80/~b90013/DBhw7.pdf
+ Y http://web.archive.org/web/20040220222413/http://homepages.uc.edu:80/~lukovib/aiaa_02_0857.pdf
+ Y http://www.kki.yamanashi.ac.jp/~ohbuchi/online_pubs/IEEE_bigMM2015_Matsuda/BigMM_20150224b_web.pdf
+
+other words:
+ N https://files.eric.ed.gov/fulltext/ED069848.pdf tech report?
+ N http://istitutocomprensivopescara2.gov.it/attachments/article/164/griglia_osservativa_bes_terza_fascia.pdf table
+ M https://jfjustice.net/userfiles/file/Research/Report%20of%20the%20Outreach%20Forums%20on%20the%20PIL%20Cases%20on%20Sexual%20Gender%20Based%20Violence.pdf report
+ M http://www.iitk.ac.in/nicee/wcee/article/13_9035.pdf filler page? like a paper
+ Y http://www.dtic.mil/dtic/tr/fulltext/u2/314095.pdf
+ Y https://www.casact.org/pubs/proceed/proceed25/25400.pdf
+ Y http://circres.ahajournals.org/content/circresaha/111/8/1002.full.pdf
+ Y http://web.archive.org/web/20170313034332/http://thixomet.ru/UserFiles/File/Articles/1/2.CHM_2006_02-2.pdf
+ Y http://www.redalyc.org/pdf/873/87313713019.pdf
+ Y http://ukacc.group.shef.ac.uk/proceedings/control2004/Papers/213.pdf
+ Y http://periodicos.uem.br:80/ojs/index.php/RbhrAnpuh/article/download/23988/13095
+ Y http://w3.uqo.ca/photonique/papers/measurement.pdf
+ Y http://web.archive.org/web/20140312150030/http://afms.org.au/proceedings/9/Griffiths.pdf
+ Y http://www.hal.inserm.fr/file/index/docid/580194/filename/PROSTATE_SEGMENTATION_IN_HIFU_THERAPY.pdf
+ Y http://journal.ipb.ac.id/index.php/jmht/article/download/6003/4658
+
+publications:
+ N http://web.archive.org/web/20060527120026/http://www.merenkulkulaitos.fi:80/e/services/informationservices/publications/bulletin/avaa.php?id=336 treaty?
+ N http://orbit.dtu.dk/en/publications/status-for-skarven-i-danmark(8ffaf614-387e-429f-9fd4-4677ee5016ae).pdf?nofollow=true&rendering=standard related to a paper?
+ N http://community.trinity.nsw.edu.au/navbar/publications/docs/news/2_pn/2016/ps160103.pdf newsletter
+ N http://web.archive.org/web/20170216001602/https://www.nass.usda.gov/Statistics_by_State/New_Mexico/Publications/Annual_Statistical_Bulletin/2005/03_05.pdf report
+ N http://web.archive.org/web/20110109080048/http://www.ipria.org/publications/on-line-bulletins/austdev/AusDevsBulletin07.09.pdf
+ N http://web.archive.org/web/20060930192249/http://www.nmmfa.org/publications/CensusTracts/35031940200.pdf
+ N http://web.archive.org/web/20100621152841/http://psychologymatters.org/workforce/publications/01-doc-empl/table-11.pdf
+ N http://www.dtce.org.pk/DTCE/Publications/PN2 final report-dr8-F.pdf
+ Y https://www.frbatlanta.org/-/media/Documents/research/publications/wp/1995/wp9513.pdf
+ Y http://irrec.ifas.ufl.edu/IRSWS/publications/Lu_ESPR_2011.pdf
+
+doi:
+ M https://page-one.live.cf.public.springer.com/pdf/preview/10.1007/s11229-012-0117-8 paper, but only fragment (!?!?!)
+
+
+TODO:
+- drop "publications", "research", "pubs"
+- edu tilde is borderline... but keep it for now
+- black-list page-one.*
diff --git a/pig/README.md b/pig/README.md
index d14d2ae..df8ce68 100644
--- a/pig/README.md
+++ b/pig/README.md
@@ -12,12 +12,13 @@ by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of
To run tests, you need Java installed and `JAVA_HOME` configured.
-Fetch dependencies (pig):
+Fetch dependencies (including pig) from top-level directory:
- ./fetch_deps.sh
+ ./fetch_hadoop.sh
-Write .pig scripts here, and add a pytho wrapper test to `./tests/` when done.
-Test vector files (input/output) can go in `./tests/files/`.
+Write `.pig` scripts in this directory, and add a python wrapper test to
+`./tests/` when done. Test vector files (input/output) can go in
+`./tests/files/`.
Run the tests with:
diff --git a/pig/fetch_deps.sh b/pig/fetch_deps.sh
deleted file mode 100755
index 4cefa5e..0000000
--- a/pig/fetch_deps.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-#PIG_VERSION="0.12.0-cdh5.2.0"
-# Using more recent version to work around snappy classpath problem
-PIG_VERSION="0.17.0"
-JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
-
-mkdir -p deps/
-cd deps/
-
-# Fetch Pig
-#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
-#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
-wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz
-tar xvf pig-${PIG_VERSION}.tar.gz
-ln -fs pig-${PIG_VERSION} pig
-./pig/bin/pig -x local -version
-
diff --git a/pig/filter-cdx-paper-pdfs.pig b/pig/filter-cdx-paper-pdfs.pig
index 7e10720..402d340 100644
--- a/pig/filter-cdx-paper-pdfs.pig
+++ b/pig/filter-cdx-paper-pdfs.pig
@@ -30,7 +30,7 @@ cdx = FILTER cdx
OR surt matches '(?i).+\\).*/(pubs|research|publications?|articles?|proceedings?|papers?|fulltext)/.*'
-- words in domains
- OR surt matches '.*(,hal|,eprint|scielo|redalyc|revues|revistas|research|journal).*\\).*'
+ OR surt matches '.*(,hal|,eprint|,ojs|,dspace|scielo|redalyc|revues|revistas|research|journal).*\\).*'
-- DOI-like pattern in URL
OR surt matches '.*\\).*/10\\.\\d{3,5}/.*';
diff --git a/pig/filter-cdx-pdfs.pig b/pig/filter-cdx-pdfs.pig
new file mode 100644
index 0000000..a2882ac
--- /dev/null
+++ b/pig/filter-cdx-pdfs.pig
@@ -0,0 +1,24 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not url matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+cdx = ORDER cdx by url, timestamp PARALLEL 50;
+cdx = FOREACH cdx GENERATE cdxline;
+STORE cdx INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig
index 6e80acc..b27a547 100644
--- a/pig/filter-cdx-ps.pig
+++ b/pig/filter-cdx-ps.pig
@@ -1,3 +1,9 @@
+-- Tries to filter down a large CDX file (GWB index) to a subset of postscript
+-- files, by mimetype.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
%default INPUT ''
%default OUTPUT ''
diff --git a/pig/filter-cdx-source-code-crude.pig b/pig/filter-cdx-source-code-crude.pig
new file mode 100644
index 0000000..589aebd
--- /dev/null
+++ b/pig/filter-cdx-source-code-crude.pig
@@ -0,0 +1,40 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of source code
+-- files by mimetype and file extension.
+-- This is pretty crude and requires the URL to end with the file extension.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: October 2019
+
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
+cdx = FILTER cdx BY not surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*text.*';
+
+-- This is the core regex
+cdx = FILTER cdx
+
+ -- file suffix
+ BY surt matches '.*\\).*\\.(c|h|py|java)';
+
+-- DISTINCT by sha1 column
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/filter-cdx-tarball.pig b/pig/filter-cdx-tarball.pig
new file mode 100644
index 0000000..d0be0f7
--- /dev/null
+++ b/pig/filter-cdx-tarball.pig
@@ -0,0 +1,38 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of tarballs
+-- (.tar.gz). Intention is to find software code that isn't in, eg, git.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
+cdx = FILTER cdx BY not surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*(octet|gzip|gtar|tgz).*';
+
+-- This is the core regex
+cdx = FILTER cdx
+ -- .tar.gz in URL
+ BY surt matches '(?i).+\\).*\\.tar\\.gz.*';
+
+-- DISTINCT by sha1 column
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/join-cdx-sha1.pig b/pig/join-cdx-sha1.pig
new file mode 100644
index 0000000..86b9bb6
--- /dev/null
+++ b/pig/join-cdx-sha1.pig
@@ -0,0 +1,43 @@
+
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: December 2020
+--
+-- This pig script is intended to run against the full (many TByte) GWB CDX, and
+-- catch captures that match exact SHA1 (b32 encoded), regardless of mimetype.
+--
+-- The process is to filter the CDX for non-revisit HTTP 200s, sort this by
+-- SHA1 digest, then join with the (pre-sorted) SHA1 b32 input list, and dump
+-- output.
+
+%default INPUT_CDX ''
+%default INPUT_DIGEST ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray;
+digests = ORDER digests by sha1b32 ASC PARALLEL 20;
+digests = DISTINCT digests;
+
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1b32, cdxline;
+cdx = FILTER cdx BY not cdx_surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY not mimetype matches 'warc/revisit';
+cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40;
+
+-- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency
+
+-- Core JOIN
+full_join = JOIN cdx BY sha1b32, digests BY sha1b32;
+
+-- TODO: at most, say 5 CDX lines per sha1b32?
+
+result = FOREACH full_join GENERATE cdxline;
+
+STORE result INTO '$OUTPUT' USING PigStorage();
diff --git a/pig/tests/files/example.sha1b32 b/pig/tests/files/example.sha1b32
new file mode 100644
index 0000000..20a1357
--- /dev/null
+++ b/pig/tests/files/example.sha1b32
@@ -0,0 +1,4 @@
+EJWYVOPONJRARK7SGG6COFRN7CSTHROY
+V32E3CCO7NMI2M4OHLKG73DXD72LR4B2
+3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK
diff --git a/pig/tests/files/sourcecode.cdx b/pig/tests/files/sourcecode.cdx
new file mode 100644
index 0000000..eeb397c
--- /dev/null
+++ b/pig/tests/files/sourcecode.cdx
@@ -0,0 +1,6 @@
+# match
+edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.java 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.java text/plain 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+# no
+fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pyc 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pyc text/plain 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+# no
+org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
diff --git a/pig/tests/files/tarballs.cdx b/pig/tests/files/tarballs.cdx
new file mode 100644
index 0000000..7a81b79
--- /dev/null
+++ b/pig/tests/files/tarballs.cdx
@@ -0,0 +1,10 @@
+#http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf
+#http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf
+#http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf
+#http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf
+
+# should match 2:
+
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/octet-stream 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz
+org,sgmjournals,ijs)//cgi/reprint/54/6/2217.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/gzip 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py
index 4aa4259..95e0426 100644
--- a/pig/tests/pighelper.py
+++ b/pig/tests/pighelper.py
@@ -17,6 +17,9 @@ import unittest
import subprocess
+def count_lines(s):
+ return len([l for l in s.strip().split('\n') if len(l) > 0])
+
class PigTestHelper(unittest.TestCase):
@classmethod
@@ -50,7 +53,7 @@ class PigTestHelper(unittest.TestCase):
return retval
def run_pig(self, script_path, in_file, **kwargs):
- """Convenience helper around run_pig().
+ """Convenience helper around run_pig_raw().
INPUT parameter is set to in_file.
OUTPUT parameter is set to a random file.
diff --git a/pig/tests/test_filter_cdx_paper_pdfs.py b/pig/tests/test_filter_cdx_paper_pdfs.py
index a8ebd9f..c2d2e6b 100644
--- a/pig/tests/test_filter_cdx_paper_pdfs.py
+++ b/pig/tests/test_filter_cdx_paper_pdfs.py
@@ -1,10 +1,8 @@
import os
import unittest
-from pighelper import PigTestHelper
+from pighelper import PigTestHelper, count_lines
-def count_lines(s):
- return len([l for l in s.strip().split('\n') if len(l) > 0])
class TestFilterCDXPaperPdfs(PigTestHelper):
diff --git a/pig/tests/test_filter_software.py b/pig/tests/test_filter_software.py
new file mode 100644
index 0000000..cce90b4
--- /dev/null
+++ b/pig/tests/test_filter_software.py
@@ -0,0 +1,16 @@
+
+import os
+import unittest
+from pighelper import PigTestHelper, count_lines
+
+
+class TestFilterCDXSoftware(PigTestHelper):
+
+ def test_tarballs(self):
+ r = self.run_pig("filter-cdx-tarball.pig", "tests/files/tarballs.cdx")
+ assert count_lines(r) == 2
+
+ def test_source_code(self):
+ r = self.run_pig("filter-cdx-source-code-crude.pig", "tests/files/sourcecode.cdx")
+ assert count_lines(r) == 1
+
diff --git a/pig/tests/test_join_cdx.py b/pig/tests/test_join_cdx.py
new file mode 100644
index 0000000..e6eca6a
--- /dev/null
+++ b/pig/tests/test_join_cdx.py
@@ -0,0 +1,44 @@
+
+import os
+import unittest
+import tempfile
+import subprocess
+from pighelper import PigTestHelper, count_lines
+
+class TestJoinCDXSha1(PigTestHelper):
+
+ def run_pig_join(self, script_path, cdx_file, digest_file, **kwargs):
+ """Convenience helper around run_pig_raw().
+
+ INPUT_CDX parameter is set to cdx_file; INPUT_DIGEST to digest_file.
+ OUTPUT parameter is set to a random file.
+ Any keyword args are passed as parameters.
+ """
+
+ pargs = []
+ for key, value in kwargs.items():
+ pargs.append('-p')
+ pargs.append('{}={}'.format(key, value))
+
+ out_file = tempfile.mktemp(dir=self._tmpdir)
+ params = [
+ '-f', script_path,
+ '-p', 'INPUT_CDX={}'.format(cdx_file),
+ '-p', 'INPUT_DIGEST={}'.format(digest_file),
+ '-p', 'OUTPUT={}'.format(out_file),
+ ] + pargs
+ status = self.run_pig_raw(params)
+ assert status.returncode == 0
+ # Capture all the part-r-* files
+ print("out_file: {}".format(out_file))
+ subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True)
+ sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True)
+ out = sub.stdout.decode('utf-8')
+ print(out)
+ return out
+
+ # TODO: helper to verify that output matches an expected file
+
+ def test_thing(self):
+ r = self.run_pig_join("join-cdx-sha1.pig", "tests/files/example.cdx", "tests/files/example.sha1b32")
+ assert count_lines(r) == 4
diff --git a/please b/please
index 119790d..298a1c5 100755
--- a/please
+++ b/please
@@ -2,8 +2,8 @@
"""
Helper script for running Sandcrawler (journal pipeline) tasks in production.
-This is basically a Makefile. Be sure to only use python3 standard library
-modules, so there are no dependencies.
+This is basically a Makefile. If you edit this file, be sure to only use
+python3 standard library modules, so there are no dependencies.
"""
import sys
@@ -16,9 +16,13 @@ HBASE_HOST = "wbgrp-svc263.us.archive.org"
ZOOKEEPER_HOSTS = "mtrcs-zk1.us.archive.org:2181"
GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
+# Staging config
+#HBASE_HOST = "wbgrp-svc312.us.archive.org"
+#ZOOKEEPER_HOSTS = "wbgrp-svc312.us.archive.org:2181"
+
def rebuild_python():
print("Rebuilding python venv...")
- cmd = """cd mapreduce;
+ cmd = """cd python_hadoop;
export PIPENV_VENV_IN_PROJECT=1;
pipenv install --deploy
tar -czf venv-current.tar.gz -C .venv ."""
@@ -37,7 +41,7 @@ def run_backfill(args):
HDFS_DIR,
args.env,
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
- cmd = """cd mapreduce;
+ cmd = """cd python_hadoop;
pipenv run ./backfill_hbase_from_cdx.py \
--hbase-host {hbase_host} \
--hbase-table wbgrp-journal-extract-0-{env} \
@@ -57,23 +61,54 @@ def run_extract(args):
HDFS_DIR,
args.env,
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
- cmd = """cd mapreduce;
+ cmd = """cd python_hadoop;
pipenv run ./extraction_cdx_grobid.py \
--hbase-host {hbase_host} \
--hbase-table wbgrp-journal-extract-0-{env} \
--grobid-uri {grobid_uri} \
-r hadoop \
-c mrjob.conf \
+ --output-dir {output} \
+ --no-output \
--archive venv-current.tar.gz#venv \
--jobconf mapred.line.input.format.linespermap=8000 \
--jobconf mapreduce.job.queuename=extraction \
--jobconf mapred.task.timeout=3600000 \
{input_cdx}
""".format(hbase_host=HBASE_HOST, env=args.env,
+ output=output,
input_cdx=args.input_cdx,
grobid_uri=GROBID_URI)
subprocess.call(cmd, shell=True)
+def run_extract_ungrobided(args):
+ if args.rebuild:
+ rebuild_python()
+ print("Starting extractungrobided job...")
+ output = "{}/output-{}/{}-extract-ungrobided".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """cd python_hadoop;
+ pipenv run ./extraction_ungrobided.py \
+ --hbase-host {hbase_host} \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --grobid-uri {grobid_uri} \
+ -r hadoop \
+ -c mrjob.conf \
+ --output-dir {output} \
+ --no-output \
+ --archive venv-current.tar.gz#venv \
+ --jobconf mapred.line.input.format.linespermap=8000 \
+ --jobconf mapreduce.job.queuename=extraction \
+ --jobconf mapred.task.timeout=3600000 \
+ {input_ungrobided}
+ """.format(hbase_host=HBASE_HOST, env=args.env,
+ input_ungrobided=args.input_ungrobided,
+ output=output,
+ grobid_uri=GROBID_URI)
+ subprocess.call(cmd, shell=True)
+
def run_rowcount(args):
if args.rebuild:
rebuild_scalding()
@@ -95,6 +130,27 @@ def run_rowcount(args):
env=args.env)
subprocess.call(cmd, shell=True)
+def run_statuscodecount(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting statuscodecount job...")
+ output = "{}/output-{}/{}-statuscodecount".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.HBaseStatusCodeCountJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
def run_statuscount(args):
if args.rebuild:
rebuild_scalding()
@@ -105,7 +161,322 @@ def run_statuscount(args):
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
cmd = """hadoop jar \
scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
- com.twitter.scalding.Tool sandcrawler.HBaseStatusCountJob \
+ com.twitter.scalding.Tool \
+ -Dmapred.task.timeout=3600000 \
+ sandcrawler.HBaseStatusCountJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_matchcrossref(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting matchcrossref job...")
+ output = "{}/output-{}/{}-matchcrossref".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ if args.fatcat_insertable:
+ jobclass = "ScoreInsertableJob"
+ else:
+ jobclass = "ScoreJob"
+ # Notes: -D options must come after Tool but before class name
+ # https://github.com/twitter/scalding/wiki/Frequently-asked-questions#how-do-i-pass-parameters-to-my-hadoop-job-number-of-reducers--memory-options--etc-
+ # Compression: changed due to errors in production
+ # https://stackoverflow.com/a/11336820/4682349
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ -Dmapred.reduce.tasks={reducers} \
+ -Dcascading.spill.list.threshold=500000 \
+ -Dmapred.output.compress=false \
+ -Dmapred.compress.map.output=true \
+ -Dmapred.task.timeout=3600000 \
+ sandcrawler.{jobclass} \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --crossref-input {crossref_input} \
+ --output {output}""".format(
+ output=output,
+ jobclass=jobclass,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env,
+ reducers=args.reducers,
+ crossref_input=args.crossref_input)
+ subprocess.call(cmd, shell=True)
+
+def run_groupworks(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting groupworks job...")
+ output = "{}/output-{}/{}-groupworks".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ jobclass = "GroupFatcatWorksJob"
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ -Dmapred.reduce.tasks={reducers} \
+ -Dcascading.spill.list.threshold=500000 \
+ -Dmapred.output.compress=false \
+ -Dmapred.compress.map.output=true \
+ -Dmapred.task.timeout=3600000 \
+ sandcrawler.{jobclass} \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --fatcat-release-input {fatcat_release_input} \
+ --output {output}""".format(
+ output=output,
+ jobclass=jobclass,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env,
+ reducers=args.reducers,
+ fatcat_release_input=args.fatcat_release_input)
+ subprocess.call(cmd, shell=True)
+
+def run_groupworkssubset(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting groupworkssubset job...")
+ output = "{}/output-{}/{}-groupworkssubset".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ jobclass = "GroupFatcatWorksSubsetJob"
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ -Dmapred.reduce.tasks={reducers} \
+ -Dcascading.spill.list.threshold=500000 \
+ -Dmapred.output.compress=false \
+ -Dmapred.compress.map.output=true \
+ -Dmapred.task.timeout=3600000 \
+ sandcrawler.{jobclass} \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --fatcat-release-input {fatcat_release_input_left} \
+ --fatcat-release-input-right {fatcat_release_input_right} \
+ --output {output}""".format(
+ output=output,
+ jobclass=jobclass,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env,
+ reducers=args.reducers,
+ fatcat_release_input_left=args.fatcat_release_input_left,
+ fatcat_release_input_right=args.fatcat_release_input_right)
+ subprocess.call(cmd, shell=True)
+
+def run_grobidscorabledump(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting grobid-scorable-dump job...")
+ output = "{}/output-{}/{}-grobidscorabledump".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.GrobidScorableDumpJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_dumpfilemeta(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting dumpfilemeta job...")
+ output = "{}/output-{}/{}-dumpfilemeta".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.DumpFileMetaJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_dumpgrobidstatuscode(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting dumpgrobidstatuscode job...")
+ output = "{}/output-{}/{}-dumpgrobidstatuscode".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.DumpGrobidStatusCodeJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_dumpgrobidmetainsertable(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting dumpgrobidmetainsertable job...")
+ output = "{}/output-{}/{}-dumpgrobidmetainsertable".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.DumpGrobidMetaInsertableJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_dumpgrobidxml(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting dumpgrobidxml job...")
+ output = "{}/output-{}/{}-dumpgrobidxml".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.DumpGrobidXmlJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_colcount(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting colcount job...")
+ output = "{}/output-{}/{}-colcount-{}".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"),
+ args.column.replace(':', '_'))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.HBaseColCountJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --column {column} \
+ --output {output}""".format(
+ column=args.column,
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_matchbenchmark(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting matchbenchmark job...")
+ cmd = """./pig/deps/hadoop/bin/hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ sandcrawler.MatchBenchmarkJob \
+ --local \
+ --app.conf.path scalding/ia_cluster.conf \
+ --left-bibjson {left_bibjson} \
+ --right-bibjson {right_bibjson} \
+ --output {output}""".format(
+ output=args.output,
+ left_bibjson=args.left_bibjson,
+ right_bibjson=args.right_bibjson)
+ subprocess.call(cmd, shell=True)
+
+def run_groupworksbenchmark(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting groupworksbenchmark job...")
+ cmd = """./pig/deps/hadoop/bin/hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ sandcrawler.GroupFatcatWorksJob \
+ --local \
+ --app.conf.path scalding/ia_cluster.conf \
+ --fatcat-release-input {fatcat_releases} \
+ --output {output}""".format(
+ output=args.output,
+ fatcat_releases=args.fatcat_releases)
+ subprocess.call(cmd, shell=True)
+
+def run_keysmissingcol(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting keysmissingcol job...")
+ output = "{}/output-{}/{}-keysmissingcol-{}".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"),
+ args.column.replace(":", "_"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.MissingColumnDumpJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --column {column} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ column=args.column,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
+def run_dumpungrobided(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting dumpungrobided job...")
+ output = "{}/output-{}/{}-dumpungrobided".format(
+ HDFS_DIR,
+ args.env,
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.DumpUnGrobidedJob \
--hdfs \
--app.conf.path scalding/ia_cluster.conf \
--hbase-table wbgrp-journal-extract-0-{env} \
@@ -162,18 +533,100 @@ def main():
sub_extract.add_argument('input_cdx',
help="full HDFS path of CDX file to extract")
+ sub_extractungrobided = subparsers.add_parser('extract-ungrobided')
+ sub_extractungrobided.set_defaults(func=run_extract_ungrobided)
+ sub_extractungrobided.add_argument('input_ungrobided',
+ help="full HDFS path of 'ungrobided' file to extract")
+
sub_rowcount = subparsers.add_parser('row-count')
sub_rowcount.set_defaults(func=run_rowcount)
sub_statuscount = subparsers.add_parser('status-count')
sub_statuscount.set_defaults(func=run_statuscount)
+ sub_statuscodecount = subparsers.add_parser('status-code-count')
+ sub_statuscodecount.set_defaults(func=run_statuscodecount)
+
+ sub_matchcrossref = subparsers.add_parser('match-crossref')
+ sub_matchcrossref.set_defaults(func=run_matchcrossref)
+ sub_matchcrossref.add_argument('crossref_input',
+ help="full HDFS path of Crossref JSON dump")
+ sub_matchcrossref.add_argument('--reducers',
+ help="number of reducers to run",
+ type=int, default=200)
+ sub_matchcrossref.add_argument('--fatcat-insertable',
+ help="whether to include CDX and other metadata in output",
+ action='store_true')
+
+ sub_groupworks = subparsers.add_parser('groupworks-fatcat')
+ sub_groupworks.set_defaults(func=run_groupworks)
+ sub_groupworks.add_argument('fatcat_release_input',
+ help="full HDFS path of fatcat release JSON dump")
+ sub_groupworks.add_argument('--reducers',
+ help="number of reducers to run",
+ type=int, default=400)
+
+ sub_groupworkssubset = subparsers.add_parser('groupworkssubset-fatcat')
+ sub_groupworkssubset.set_defaults(func=run_groupworkssubset)
+ sub_groupworkssubset.add_argument('fatcat_release_input_left',
+ help="full HDFS path of fatcat release JSON dump (LHS of join)")
+ sub_groupworkssubset.add_argument('fatcat_release_input_right',
+ help="full HDFS path of fatcat release JSON dump (RHS of join)")
+ sub_groupworkssubset.add_argument('--reducers',
+ help="number of reducers to run",
+ type=int, default=200)
+
+ sub_grobidscorabledump = subparsers.add_parser('grobid-scorable-dump')
+ sub_grobidscorabledump.set_defaults(func=run_grobidscorabledump)
+
+ sub_dumpfilemeta = subparsers.add_parser('dump-file-meta')
+ sub_dumpfilemeta.set_defaults(func=run_dumpfilemeta)
+
+ sub_dumpgrobidstatuscode = subparsers.add_parser('dump-grobid-status-code')
+ sub_dumpgrobidstatuscode.set_defaults(func=run_dumpgrobidstatuscode)
+
+ sub_dumpgrobidmetainsertable = subparsers.add_parser('dump-grobid-meta-insertable')
+ sub_dumpgrobidmetainsertable.set_defaults(func=run_dumpgrobidmetainsertable)
+
+ sub_dumpgrobidxml = subparsers.add_parser('dump-grobid-xml')
+ sub_dumpgrobidxml.set_defaults(func=run_dumpgrobidxml)
+
+ sub_colcount = subparsers.add_parser('col-count')
+ sub_colcount.set_defaults(func=run_colcount)
+ sub_colcount.add_argument('column',
+ help="column name to use in count")
+
+ sub_matchbenchmark = subparsers.add_parser('match-benchmark')
+ sub_matchbenchmark.set_defaults(func=run_matchbenchmark)
+ sub_matchbenchmark.add_argument('left_bibjson',
+ help="First bibjson file")
+ sub_matchbenchmark.add_argument('right_bibjson',
+ help="Second bibjson file")
+ sub_matchbenchmark.add_argument('output',
+ help="where to write output")
+
+ sub_groupworksbenchmark = subparsers.add_parser('groupworks-benchmark')
+ sub_groupworksbenchmark.set_defaults(func=run_groupworksbenchmark)
+ sub_groupworksbenchmark.add_argument('fatcat_releases',
+ help="fatcat releases json file")
+ sub_groupworksbenchmark.add_argument('output',
+ help="where to write output")
+
+ sub_keysmissingcol = subparsers.add_parser('keys-missing-col')
+ sub_keysmissingcol.set_defaults(func=run_keysmissingcol)
+ sub_keysmissingcol.add_argument('column',
+ help="column to SCAN for missing keys")
+
+ sub_dumpungrobided = subparsers.add_parser('dump-ungrobided')
+ sub_dumpungrobided.set_defaults(func=run_dumpungrobided)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do! (try --help)")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
if not (args.prod or args.qa) or (args.prod and args.qa):
print("must pass one of --prod or --qa")
+ sys.exit(-1)
if args.prod:
args.env = "prod"
if args.qa:
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
new file mode 100644
index 0000000..c649809
--- /dev/null
+++ b/proposals/2019_ingest.md
@@ -0,0 +1,287 @@
+
+status: work-in-progress
+
+This document proposes structure and systems for ingesting (crawling) paper
+PDFs and other content as part of sandcrawler.
+
+## Overview
+
+The main abstraction is a sandcrawler "ingest request" object, which can be
+created and submitted to one of several systems for automatic harvesting,
+resulting in an "ingest result" metadata object. This result should contain
+enough metadata to be automatically imported into fatcat as a file/release
+mapping.
+
+The structure and pipelines should be flexible enough to work with individual
+PDF files, web captures, and datasets. It should work for on-demand
+(interactive) ingest (for "save paper now" features), soft-real-time
+(hourly/daily/queued), batches of hundreds or thousands of requests, and scale
+up to batch ingest crawls of tens of millions of URLs. Most code should not
+care about how or when content is actually crawled.
+
+The motivation for this structure is to consolidate and automate the current ad
+hoc systems for crawling, matching, and importing into fatcat. It is likely
+that there will still be a few special cases with their own importers, but the
+goal is that in almost all cases that we discover a new structured source of
+content to ingest (eg, a new manifest of identifiers to URLs), we can quickly
+transform the task into a list of ingest requests, then submit those requests
+to an automated system to have them archived and inserted into fatcat with as
+little manual effort as possible.
+
+## Use Cases and Workflows
+
+### Unpaywall Example
+
+As a motivating example, consider how unpaywall crawls are done today:
+
+- download and archive JSON dump from unpaywall. transform and filter into a
+ TSV with DOI, URL, release-stage columns.
+- filter out previously crawled URLs from this seed file, based on last dump,
+ with the intent of not repeating crawls unnecessarily
+- run heritrix3 crawl, usually by sharding seedlist over multiple machines.
+ after crawl completes:
+ - backfill CDX PDF subset into hbase (for future de-dupe)
+ - generate CRL files etc and upload to archive items
+- run arabesque over complete crawl logs. this takes time, is somewhat manual,
+ and has scaling issues past a few million seeds
+- depending on source/context, run fatcat import with arabesque results
+- periodically run GROBID (and other transforms) over all new harvested files
+
+Issues with this are:
+
+- seedlist generation and arabesque step are toilsome (manual), and arabesque
+ likely has metadata issues or otherwise "leaks" content
+- brozzler pipeline is entirely separate
+- results in re-crawls of content already in wayback, in particular links
+ between large corpuses
+
+New plan:
+
+- download dump, filter, transform into ingest requests (mostly the same as
+ before)
+- load into ingest-request SQL table. only new rows (unique by source, type,
+ and URL) are loaded. run a SQL query for new rows from the source with URLs
+ that have not been ingested
+- (optional) pre-crawl bulk/direct URLs using heritrix3, as before, to reduce
+ later load on SPN
+- run ingest script over the above SQL output. ingest first hits CDX/wayback,
+ and falls back to SPNv2 (brozzler) for "hard" requests, or based on URL.
+ ingest worker handles file metadata, GROBID, any other processing. results go
+ to kafka, then SQL table
+- either do a bulk fatcat import (via join query), or just have workers
+ continuously import into fatcat from kafka ingest feed (with various quality
+ checks)
+
+## Request/Response Schema
+
+For now, plan is to have a single request type, and multiple similar but
+separate result types, depending on the ingest type (file, fileset,
+webcapture). The initial use case is single file PDF ingest.
+
+NOTE: what about crawl requests where we don't know if we will get a PDF or
+HTML? Or both? Let's just recrawl.
+
+*IngestRequest*
+ - `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`. For
+ backwards compatibility, `file` should be interpreted as `pdf`. `pdf` and
+ `xml` return file ingest response; `html` and `dataset` not implemented but
+ would be webcapture (wayback) and fileset (archive.org item or wayback?).
+ In the future: `epub`, `video`, `git`, etc.
+ - `base_url`: required, where to start crawl process
+ - `link_source`: recommended, slug string. indicating the database or "authority"
+ where URL/identifier match is coming from (eg, `doi`, `pmc`, `unpaywall`
+ (doi), `s2` (semantic-scholar id), `spn` (fatcat release), `core` (CORE
+ id), `mag` (MAG id))
+ - `link_source_id`: recommended, identifier string. pairs with `link_source`.
+ - `ingest_request_source`: recommended, slug string. tracks the service or
+ user who submitted request. eg, `fatcat-changelog`, `editor_<ident>`,
+ `savepapernow-web`
+ - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL
+ - `rel`: optional. indicates the link type
+ - `force_recrawl`: optional. if true, will always SPNv2 (won't check wayback)
+ - `oa_status`: optional. unpaywall schema
+ - `edit_extra`: additional metadata to be included in any eventual fatcat commits.
+ - `fatcat`
+ - `release_ident`: optional. if provided, indicates that ingest is expected
+ to be fulltext copy of this release (though may be a sibling release
+ under same work if `release_stage` doesn't match)
+ - `work_ident`: optional, unused. might eventually be used if, eg,
+ `release_stage` of ingested file doesn't match that of the `release_ident`
+ - `ext_ids`: matching fatcat schema. used for later lookups. sometimes
+ `link_source` and id are sufficient.
+ - `doi`
+ - `pmcid`
+ - ...
+
+*FileIngestResult*
+ - `request` (object): the full IngestRequest, copied
+ - `status` (slug): 'success', 'error', etc
+ - `hit` (boolean): whether we got something that looks like what was requested
+ - `terminal` (object): last crawled resource (if any)
+ - `terminal_url` (string; formerly `url`)
+ - `terminal_dt` (string): wayback capture datetime (string)
+ - `terminal_status_code`
+ - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1)
+ (in case of transport encoding difference)
+ - `file_meta` (object): info about the terminal file
+ - same schema as sandcrawler-db table
+ - `size_bytes`
+ - `md5hex`
+ - `sha1hex`
+ - `sha256hex`
+ - `mimetype`: if not known, `application/octet-stream`
+ - `cdx`: CDX record matching terminal resource. *MAY* be a revisit or partial
+ record (eg, if via SPNv2)
+ - same schema as sandcrawler-db table
+ - `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the
+ best "original" location for retrieval of the body (matching `file_meta`)
+ - same schema as sandcrawler-db table
+ - `grobid`
+ - same schema as sandcrawler-db table
+ - `status` (string)
+ - `status_code` (int)
+ - `grobid_version` (string, from metadata)
+ - `fatcat_release` (string, from metadata)
+ - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed)
+ - NOT `tei_xml` (strip from reply)
+ - NOT `file_meta` (strip from reply)
+
+In general, it is the `terminal_dt` and `terminal_url` that should be used to
+construct wayback links (eg, for insertion to fatcat), not from the `cdx`.
+
+## New SQL Tables
+
+Sandcrawler should persist status about:
+
+- claimed locations (links) to fulltext copies of in-scope works, from indexes
+ like unpaywall, MAG, semantic scholar, CORE
+ - with enough context to help insert into fatcat if works are crawled and
+ found. eg, external identifier that is indexed in fatcat, and
+ release-stage
+- state of attempting to crawl all such links
+ - again, enough to insert into fatcat
+ - also info about when/how crawl happened, particularly for failures, so we
+ can do retries
+
+Proposing two tables:
+
+ -- source/source_id examples:
+ -- unpaywall / doi
+ -- mag / mag_id
+ -- core / core_id
+ -- s2 / semanticscholar_id
+ -- doi / doi (for any base_url which is just https://doi.org/10..., regardless of why enqueued)
+ -- pmc / pmcid (for any base_url like europepmc.org, regardless of why enqueued)
+ -- arxiv / arxiv_id (for any base_url like arxiv.org, regardless of why enqueued)
+ CREATE TABLE IF NOT EXISTS ingest_request (
+ -- one row per ingest request: a claimed fulltext location (base_url), as
+ -- asserted by a link source (link_source / link_source_id)
+ -- conceptually: source, source_id, ingest_type, url
+ -- but we use this order for PRIMARY KEY so we have a free index on type/URL
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+ link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+
+ -- defaults to insert time when not supplied
+ created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ -- optional, but must be non-empty when present
+ release_stage TEXT CHECK (octet_length(release_stage) >= 1),
+ request JSONB,
+ -- request isn't required, but can stash extra fields there for import, eg:
+ -- ext_ids (source/source_id sometimes enough)
+ -- release_ident (if ext_ids and source/source_id not specific enough; eg SPN)
+ -- edit_extra
+ -- rel
+ -- oa_status
+ -- ingest_request_source TEXT NOT NULL CHECK (octet_length(ingest_request_source) >= 1),
+
+ PRIMARY KEY (ingest_type, base_url, link_source, link_source_id)
+ );
+
+ CREATE TABLE IF NOT EXISTS ingest_file_result (
+ -- one row per (ingest_type, base_url) pair, holding the outcome of the ingest attempt
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT,
+ -- terminal_* columns describe the last crawled resource (if any), so all are nullable
+ terminal_url TEXT,
+ terminal_dt TEXT,
+ terminal_status_code INT,
+ terminal_sha1hex TEXT,
+
+ PRIMARY KEY (ingest_type, base_url)
+ );
+ -- secondary indexes for lookups by terminal URL or by terminal file hash
+ CREATE INDEX IF NOT EXISTS ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
+ CREATE INDEX IF NOT EXISTS ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+
+## New Kafka Topics
+
+- `sandcrawler-ENV.ingest-file-requests`
+- `sandcrawler-ENV.ingest-file-results`
+
+## Ingest Tool Design
+
+The basics of the ingest tool are to:
+
+- use native wayback python library to do fast/efficient lookups and redirect
+ lookups
+- starting from base-url, do a fetch to either target resource or landing page:
+ follow redirects, at terminus should have both CDX metadata and response body
+ - if no capture, or most recent is too old (based on request param), do
+ SPNv2 (brozzler) fetches before wayback lookups
+- if looking for PDF but got landing page (HTML), try to extract a PDF link
+ from HTML using various tricks, then do another fetch. limit this
+ recursion/spidering to just landing page (or at most one or two additional
+ hops)
+
+Note that if we pre-crawled with heritrix3 (with `citation_pdf_url` link
+following), then in the large majority of simple cases we should find the PDF
+already captured in wayback, and will not need to fall back to SPNv2.
+
+## Design Issues
+
+### Open Questions
+
+Do direct aggregator/repositories crawls need to go through this process? Eg
+arxiv.org or pubmed central. I guess so, otherwise how do we get full file
+metadata (size, other hashes)?
+
+When recording hit status for a URL (ingest result), is that status dependent
+on the crawl context? Eg, for save-paper-now we might want to require GROBID.
+Semantics of `hit` should probably be consistent: if we got the filetype
+expected based on type, not whether we would actually import to fatcat.
+
+Where to include knowledge about, eg, single-page abstract PDFs being bogus? Do
+we just block crawling, set an ingest result status, or only filter at fatcat
+import time? Definitely need to filter at fatcat import time to make sure
+things don't slip through elsewhere.
+
+### Yet Another PDF Harvester
+
+This system could result in "yet another" set of publisher-specific heuristics
+and hacks to crawl publicly available papers. Related existing work includes
+[unpaywall's crawler][unpaywall_crawl], LOCKSS extraction code, dissem.in's
+efforts, zotero's bibliography extractor, etc. The "memento tracer" work is
+also similar. Many of these are even in python! It would be great to reduce
+duplicated work and maintenance. An analogous system in the wild is youtube-dl
+for downloading video from many sources.
+
+[unpaywall_crawl]: https://github.com/ourresearch/oadoi/blob/master/webpage.py
+[memento_tracer]: http://tracer.mementoweb.org/
+
+One argument against this would be that our use-case is closely tied to
+save-page-now, wayback, and the CDX API. However, a properly modular
+implementation of a paper downloader would allow components to be re-used, and
+perhaps dependency injection for things like HTTP fetches to allow use of SPN
+or similar. Another argument for modularity would be support for headless
+crawling (eg, brozzler).
+
+Note that this is an internal implementation detail; the ingest API would
+abstract all this.
+
+## Test Examples
+
+Some example works that are difficult to crawl. Should have mechanisms to crawl
+and unit tests for all these.
+
+- <https://pubs.acs.org>
+- <https://linkinghub.elsevier.com> / <https://sciencedirect.com>
+- <https://www.osapublishing.org/captcha/?guid=39B0E947-C0FC-B5D8-2C0C-CCF004FF16B8>
+- <https://utpjournals.press/action/cookieAbsent>
+- <https://academic.oup.com/jes/article/3/Supplement_1/SUN-203/5484104>
+- <http://www.jcancer.org/v10p4038.htm>
diff --git a/proposals/2019_pdftotext_pdfinfo.md b/proposals/2019_pdftotext_pdfinfo.md
new file mode 100644
index 0000000..ed731a4
--- /dev/null
+++ b/proposals/2019_pdftotext_pdfinfo.md
@@ -0,0 +1,123 @@
+
+status: brainstorming/backburner
+
+last updated: 2019-12-11
+
+This document proposes changes to extract text and metadata from PDFs at ingest
+time using pdftotext and pdfinfo, and storing this content in SQL and minio.
+
+This isn't a priority at the moment. Could be useful for fulltext search when
+GROBID fails, and the pdfinfo output might help with other quality checks.
+
+## Overview / Motivation
+
+`pdfinfo` and `pdftotext` can both be run quickly over raw PDFs. In
+sandcrawler, fetching PDFs can be a bit slow, so the motivation for caching the
+text is just to not have to fetch the PDFs over and over. Metadata is useful to
+store and index at scale.
+
+## pdfinfo output
+
+Example PDF info outputs:
+
+ Creator: PDF Suite 2010
+ Producer: PDF Suite 2010
+ CreationDate: Tue Sep 24 23:03:58 2013 PDT
+ ModDate: Tue Sep 24 23:03:58 2013 PDT
+ Tagged: no
+ UserProperties: no
+ Suspects: no
+ Form: none
+ JavaScript: no
+ Pages: 17
+ Encrypted: no
+ Page size: 612 x 792 pts (letter)
+ Page rot: 0
+ File size: 105400 bytes
+ Optimized: no
+ PDF version: 1.4
+
+another:
+
+ Title: Miscellanea Zoologica Hungarica 8. 1993 (Budapest, 1993)
+ Author: L. Forró szerk.
+ Producer: ABBYY FineReader 9.0 Corporate Edition
+ CreationDate: Wed Apr 13 05:30:21 2011 PDT
+ ModDate: Wed Apr 13 09:53:27 2011 PDT
+ Tagged: yes
+ UserProperties: no
+ Suspects: no
+ Form: AcroForm
+ JavaScript: no
+ Pages: 13
+ Encrypted: no
+ Page size: 473.76 x 678.42 pts
+ Page rot: 0
+ File size: 12047270 bytes
+ Optimized: no
+ PDF version: 1.6
+
+With the `-meta` flag, you get XML output, which also includes:
+
+ <xmpMM:DocumentID>uuid:cd1a8daa-61e1-48f4-b679-26eac52bb6a9</xmpMM:DocumentID>
+ <xmpMM:InstanceID>uuid:dea54c78-8bc6-4f2f-a665-4cd7e62457e7</xmpMM:InstanceID>
+
+The document id is particularly interesting for fatcat/sandcrawler. Apparently
+it is randomly created (or based on md5?) of first version of the file, and
+persists across edits. A quality check would be that all files with the same
+`document_id` should be clustered under the same fatcat work.
+
+All the info fields could probably be combined and used in categorization and
+filtering (ML or heuristic). Eg, a PDF with forms is probably not research
+output; published PDFs with specific "Producer" software probably are.
+
+## Fatcat Changes
+
+Could include in entity fields, a `pdfinfo` JSONB field, or existing `extra`:
+
+- pages
+- words
+- document id
+- page size
+- created
+- other meta (eg, PDF title, author, etc)
+
+All of these fields are, I assume, deterministic, thus appropriate for
+inclusion in fatcat.
+
+## New SQL Tables
+
+ CREATE TABLE IF NOT EXISTS pdftotext ( -- one row per PDF file, keyed by the SHA-1 of the file
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), -- hex-encoded, so exactly 40 characters
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ tool_version TEXT CHECK (octet_length(tool_version) >= 1),
+ text_success BOOLEAN NOT NULL, -- presumably: whether pdftotext extraction succeeded (confirm)
+ text_words INT, -- presumably: word count of the extracted text (confirm)
+ info_success BOOLEAN NOT NULL, -- presumably: whether pdfinfo extraction succeeded (confirm)
+ pages INT, -- presumably the "Pages" field of pdfinfo output (confirm)
+ pdf_created TIMESTAMP WITH TIME ZONE, -- presumably parsed from pdfinfo "CreationDate" (confirm)
+ document_id TEXT CHECK (octet_length(document_id) >= 1), -- presumably xmpMM:DocumentID from `pdfinfo -meta`. XXX: always UUID?
+ metadata JSONB
+ -- metadata contains any other stuff from pdfinfo:
+ -- title
+ -- author
+ -- pdf version
+ -- page size (?)
+ -- instance_id
+ );
+ -- CREATE INDEX pdftotext_document_id_idx ON pdftotext(document_id); -- optional; index name must differ from the table name
+
+## New Kafka Topics
+
+ sandcrawler-ENV.pdftotext-output
+
+Key would be sha1hex of PDF.
+
+Schema would match the SQL table, plus the full raw PDF text output.
+
+## New Minio Stuff
+
+ /pdftotext/<hexbyte0>/<hexbyte1>/<sha1hex>.txt
+
+## Open Questions
+
diff --git a/proposals/20200129_pdf_ingest.md b/proposals/20200129_pdf_ingest.md
new file mode 100644
index 0000000..9469217
--- /dev/null
+++ b/proposals/20200129_pdf_ingest.md
@@ -0,0 +1,272 @@
+
+status: planned
+
+2020q1 Fulltext PDF Ingest Plan
+===================================
+
+This document lays out a plan and tasks for a push on crawling and ingesting
+more fulltext PDF content in early 2020.
+
+The goal is to get the current generation of pipelines and matching tools
+running smoothly by the end of March, when the Mellon phase 1 grant ends. As a
+"soft" goal, would love to see over 25 million papers (works) with fulltext in
+fatcat by that deadline as well.
+
+This document is organized by conceptual approach, then by jobs to run and
+coding tasks needing work.
+
+There is a lot of work here!
+
+
+## Broad OA Ingest By External Identifier
+
+There are a few million papers in fatcat which:
+
+1. have a DOI, arxiv id, or pubmed central id, which can be followed to a
+ landing page or directly to a PDF
+2. are known OA, usually because publication is Gold OA
+3. don't have any fulltext PDF in fatcat
+
+As a detail, some of these "known OA" journals actually have embargos (aka,
+they aren't true Gold OA). In particular, those marked via EZB OA "color", and
+recent pubmed central ids.
+
+Of these, I think there are broadly two categories. The first is just papers we
+haven't tried directly crawling or ingesting yet at all; these should be easy
+to crawl and ingest. The second category is papers from large publishers with
+difficult to crawl landing pages (for example, Elsevier, IEEE, Wiley, ACM). The
+latter category will probably not crawl with heritrix, and we are likely to be
+rate-limited or resource constrained when using brozzler or SPNv2.
+
+Coding Tasks:
+
+- improve `fatcat_ingest.py` script to allow more granular slicing and limiting
+ the number of requests enqueued per batch (eg, to allow daily partial
+ big-publisher ingests in random order). Allow dumping arxiv+pmcid ingest
+ requests.
+
+Actions:
+
+- run broad Datacite DOI landing crawl with heritrix ("pre-ingest")
+- after Datacite crawl completes, run arabesque and ingest any PDF hits
+- run broad non-Datacite DOI landing crawl with heritrix. Use ingest tool to
+ generate (or filter a dump), removing Datacite DOIs and large publishers
+- after non-Datacite crawl completes, run entire ingest request set through in
+ bulk mode
+- start enqueuing large-publisher (hard to crawl) OA DOIs to ingest queue
+ for SPNv2 crawling (blocking ingest tool improvement, and also SPNv2 health)
+- start new PUBMEDCENTRAL and ARXIV slow-burn pubmed crawls (heritrix). Use
+ updated ingest tool to generate requests.
+
+
+## Large Seedlist Crawl Iterations
+
+We have a bunch of large, high quality seedlists, most of which haven't been
+updated or crawled in a year or two. Some use DOIs as identifiers, some use an
+internal identifier. As a quick summary:
+
+- unpaywall: currently 25 million DOIs (Crossref only?) with fulltext. URLs may
+ be doi.org, publisher landing page, or direct PDF; may be published version,
+ pre-print, or manuscript (indicated with a flag). Only crawled with heritrix;
+ last crawl was Spring 2019. There is a new dump from late 2019 with a couple
+ million new papers/URLs.
+- microsoft academic (MAG): tens of millions of papers, hundreds of millions of
+ URLs. Last crawled 2018 (?) from a 2016 dump. Getting a new full dump via
+ Azure; new dump includes type info for each URL ("pdf", "landing page", etc).
+ Uses MAG id for each URL, not DOI; hoping new dump has better MAG/DOI
+ mappings. Expect a very large crawl (tens of millions of new URLs).
+- CORE: can do direct crawling of PDFs from their site, as well as external
+ URLs. They largely have pre-prints and IR content. Have not released a dump
+ in a long time. Would expect a couple million new direct (core.ac.uk) URLs
+ and fewer new web URLs (often overlap with other lists, like MAG)
+- semantic scholar: they do regular dumps. Use SHA1 hash of PDF as identifier;
+ it's the "best PDF of a group", so not always the PDF you crawl. Host many OA
+ PDFs on their domain, very fast to crawl, as well as wide-web URLs. Their
+ scope has increased dramatically in recent years due to MAG import; expect a
+ lot of overlap there.
+
+It is increasingly important to not re-crawl URLs which have already been
+crawled and ingested.
+
+Coding Tasks:
+- transform scripts for all these seedlist sources to create ingest request
+ lists
+- sandcrawler ingest request persist script, which supports setting datetime
+- fix HBase thrift gateway so url agnostic de-dupe can be updated
+- finish ingest worker "skip existing" code path, which looks in sandcrawler-db
+ to see if URL has already been processed (for efficiency)
+
+Actions:
+- transform and persist all these old seedlists, with the URL datetime set to
+ roughly when the URL was added to the upstream corpus
+- transform arabesque output for all old crawls into ingest requests and run
+ through the bulk ingest queue. expect GROBID to be skipped for all these, and
+ for the *requests* not to be updated (SQL ON CONFLICT DO NOTHING). Will
+ update ingest result table with status.
+- fetch new MAG and unpaywall seedlists, transform to ingest requests, persist
+ into ingest request table. use SQL to dump only the *new* URLs (not seen in
+ previous dumps) using the created timestamp, outputting new bulk ingest
+ request lists. if possible, de-dupe between these two. then start bulk
+ heritrix crawls over these two long lists. Probably sharded over several
+ machines. Could also run serially (first one, then the other, with
+ ingest/de-dupe in between). Filter out usual large sites (core, s2, arxiv,
+ pubmed, etc)
+- CORE and Semantic Scholar direct crawls, of only new URLs on their domain
+ (should not significantly conflict/dupe with other bulk crawls)
+
+After this round of big crawls completes we could do iterated crawling of
+smaller seedlists, re-visit URLs that failed to ingest with updated heritrix
+configs or the SPNv2 ingest tool, etc.
+
+
+## GROBID/glutton Matching of Known PDFs
+
+Of the many PDFs in the sandcrawler CDX "working set", many were broadly
+crawled or added via CDX heuristic. In other words, we don't have an identifier
+from a seedlist. We previously ran a matching script in Hadoop that attempted
+to link these to Crossref DOIs based on GROBID extracted metadata. We haven't
+done this in a long time; in the meanwhile we have added many more such PDFs,
+added lots of metadata to our matching set (eg, pubmed and arxiv in addition to
+crossref), and have the new biblio-glutton tool for matching, which may work
+better than our old conservative tool.
+
+We have run GROBID+glutton over basically all of these PDFs. We should be able
+to do a SQL query to select PDFs that:
+
+- have at least one known CDX row
+- GROBID processed successfully and glutton matched to a fatcat release
+- do not have an existing fatcat file (based on sha1hex)
+- output GROBID metadata, `file_meta`, and one or more CDX rows
+
+An updated match importer can take this output and create new file entities.
+Then lookup the release and confirm the match to the GROBID metadata, as well
+as any other quality checks, then import into fatcat. We have some existing
+filter code we could use. The verification code should be refactored into a
+reusable method.
+
+It isn't clear to me how many new files/matches we would get from this, but
+could do some test SQL queries to check. At least a million?
+
+A related task is to update the glutton lookup table (elasticsearch index and
+on-disk lookup tables) after more recent metadata imports (Datacite, etc).
+Unsure if we should filter out records or improve matching so that we don't
+match "header" (paper) metadata to non-paper records (like datasets), but still
+allow *reference* matching (citations to datasets).
+
+Coding Tasks:
+- write SQL select function. Optionally, come up with a way to get multiple CDX
+ rows in the output (sub-query?)
+- biblio metadata verify match function (between GROBID metadata and existing
+ fatcat release entity)
+- updated match fatcat importer
+
+Actions:
+- update `fatcat_file` sandcrawler table
+- check how many PDFs this might amount to. both by uniq SHA1 and uniq
+ `fatcat_release` matches
+- do some manual random QA verification to check that this method results in
+ quality content in fatcat
+- run full updated import
+
+
+## No-Identifier PDF New Release Import Pipeline
+
+Previously, as part of longtail OA crawling work, I took a set of PDFs crawled
+from OA journal homepages (where the publisher does not register DOIs), took
+successful GROBID metadata, filtered for metadata quality, and imported about
+1.5 million new release entities into fatcat.
+
+There were a number of metadata issues with this import that we are still
+cleaning up, eg:
+
+- paper actually did have a DOI and should have been associated with existing
+ fatcat release entity; these PDFs mostly came from repository sites which
+ aggregated many PDFs, or due to unintentional outlink crawl configs
+- no container linkage for any of these releases, making coverage tracking or
+ reporting difficult
+- many duplicates in same import set, due to near-identical PDFs (different by
+ SHA-1, but same content and metadata), not merged or grouped in any way
+
+The cleanup process is out of scope for this document, but we want to do
+another round of similar imports, while avoiding these problems.
+
+As a rough sketch of what this would look like (may need to iterate):
+
+- filter to PDFs from longtail OA crawls (eg, based on WARC prefix, or URL domain)
+- filter to PDFs not in fatcat already (in sandcrawler, then verify with lookup)
+- filter to PDFs with successful GROBID extraction and *no* glutton match
+- filter/clean GROBID extracted metadata (in python, not SQL), removing stubs
+ or poor/partial extracts
+- run a fuzzy biblio metadata match against fatcat elasticsearch; use match
+ verification routine to check results
+- if fuzzy match was a hit, consider importing directly as a matched file
+ (especially if there are no existing files for the release)
+- identify container for PDF from any of: domain pattern/domain; GROBID
+ extracted ISSN or journal name; any other heuristic
+- if all these filters pass and there was no fuzzy release match, and there was
+ a container match, import a new release (and the file) into fatcat
+
+Not entirely clear how to solve the near-duplicate issue. Randomize import
+order (eg, sort by file sha1), import slowly with a single thread, and ensure
+elasticsearch re-indexing pipeline is running smoothly so the fuzzy match will
+find recently-imported hits?
+
+In theory we could use biblio-glutton API to do the matching lookups, but I
+think it will be almost as fast to hit our own elasticsearch index. Also the
+glutton backing store is always likely to be out of date. In the future we may
+even write something glutton-compatible that hits our index. Note that this is
+also very similar to how citation matching could work, though it might be
+derailing or over-engineering to come up with a single solution for both
+applications at this time.
+
+A potential issue here is that many of these papers are probably already in
+another large but non-authoritative metadata corpus, like MAG, CORE, SHARE, or
+BASE. Importing from those corpuses would want to go through the same fuzzy
+matching to ensure we aren't creating duplicate releases, but further it would
+be nice to be matching those external identifiers for any newly created
+releases. One approach would be to bulk-import metadata from those sources
+first. There are huge numbers of records in those corpuses, so we would need to
+filter down by journal/container or OA flag first. Another would be to do fuzzy
+matching when we *do* end up importing those corpuses, and update these records
+with the external identifiers. This issue really gets at the crux of a bunch of
+design issues and scaling problems with fatcat! But I think we should or need
+to make progress on these longtail OA imports without perfectly solving these
+larger issues.
+
+Details/Questions:
+- what about non-DOI metadata sources like MAG, CORE, SHARE, BASE? Should we
+ import those first, or do fuzzy matching against those?
+- use GROBID language detection and copy results to newly created releases
+- in single-threaded, could cache "recently matched/imported releases" locally
+ to prevent double-importing
+- cache container matching locally
+
+Coding Tasks:
+- write SQL select statement
+- iterate on GROBID metadata cleaning/transform/filter (have existing code for
+ this somewhere)
+- implement a "fuzzy match" routine that takes biblio metadata (eg, GROBID
+ extracted), looks in fatcat elasticsearch for a match
+- implement "fuzzy container match" routine, using as much available info as
+ possible. Could use chocula sqlite locally, or hit elasticsearch container
+ endpoint
+- update GROBID importer to use fuzzy match and other checks
+
+Actions:
+- run SQL select and estimate bounds on number of new releases created
+- do some manual randomized QA runs to ensure this pipeline is importing
+ quality content in fatcat
+- run a full batch import
+
+
+## Non-authoritative Metadata and Fulltext from Aggregators
+
+This is not fully thought through, but at some point we will probably add one
+or more large external aggregator metadata sources (MAG, Semantic Scholar,
+CORE, SHARE, BASE), and bulk import both metadata records and fulltext at the
+same time. The assumption is that those sources are doing the same fuzzy entity
+merging/de-dupe and crawling we are doing, but they have already done it
+(probably with more resources) and created stable identifiers that we can
+include.
+
+A major blocker for most such imports is metadata licensing (fatcat is CC0,
+others have restrictions). This may not be the case for CORE and SHARE though.
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
new file mode 100644
index 0000000..31a2db6
--- /dev/null
+++ b/proposals/20200207_pdftrio.md
@@ -0,0 +1,104 @@
+
+status: in progress
+
+PDF Trio (ML Classification)
+==============================
+
+This document describes how we intend to integrate the first generation of PDF
+classification work into the sandcrawler processing system.
+
+- abstractions (APIs)
+- schemas
+- how models and dependencies are deployed
+- what code is released where under what license
+
+
+## Code Structure
+
+Major components:
+
+**Training code, documentation, datasets:** Not used at run-time (does not need
+to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we
+should only release URL lists that point to wayback.
+
+**Models:** all are static, uploaded to archive.org items, simple download to
+deploy. Should be versioned, and have unique versioned file names or directory
+paths (aka, deploy in parallel).
+
+**Image classifier backend:** vanilla tensorflow serving docker image, with a
+bunch of invocation configs, plus static models.
+
+**BERT backend:** vanilla tensorflow serving docker image, plus config, plus
+models. Basically same as image classifier.
+
+**API service:** currently Flask. Depends on tools like imagemagick, fasttext,
+pdftotext. Seems like apt+pipenv should work?
+
+
+## API Refactors
+
+Changes:
+
+- probably re-write README?
+- refactor python code into directories
+- add python tests
+- tweak schema
+- proper parallelization: uwsgi? async?
+
+New features:
+
+- option to send images, raw text in batches in addition to PDFs.
+
+## Client Code
+
+Basically just like GROBID client for now. Requests, JSON.
+
+## JSON Schema
+
+Output that goes in Kafka topic:
+
+ key (sha1hex)
+ pdf_trio
+ status
+ status_code
+ ensemble_score
+ bert_score
+ image_score
+ linear_score
+ versions
+ pdftrio_version (string)
+ models_date (string, ISO date)
+ git_rev (string)
+ bert_model (string)
+ image_model (string)
+ linear_model (string)
+ timing (optional/future: as reported by API)
+ ...
+ file_meta
+ sha1hex
+ ...
+ timing
+ ...
+
+
+## SQL Schema
+
+Ensemble model versions are summarized as a date.
+
+ CREATE TABLE IF NOT EXISTS pdftrio (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+ models_date DATE,
+ ensemble_score REAL,
+ bert_score REAL,
+ linear_score REAL,
+ image_score REAL
+ );
+
+## Kafka Topic
+
+sandcrawler-qa.pdftrio-output
+
diff --git a/proposals/20200211_nsq.md b/proposals/20200211_nsq.md
new file mode 100644
index 0000000..6aa885b
--- /dev/null
+++ b/proposals/20200211_nsq.md
@@ -0,0 +1,79 @@
+
+status: planned
+
+In short, Kafka is not working well as a job task scheduler, and I want to try
+NSQ as a medium-term solution to that problem.
+
+
+## Motivation
+
+Thinking of setting up NSQ to use for scheduling distributed work, to replace
+kafka for some topics. for example, "regrobid" requests where we enqueue
+millions of, basically, CDX lines, and want to process on dozens of cores or
+multiple machines. or file ingest backfill. results would still go to kafka (to
+persist), and pipelines like DOI harvest -> import -> elasticsearch would still
+be kafka
+
+The pain point with kafka is having dozens of workers on tasks that take more
+than a couple seconds per task. we could keep tweaking kafka and writing weird
+consumer group things to handle this, but I think it will never work very well.
+NSQ supports re-queues with delay (eg, on failure, defer to re-process later),
+allows many workers to connect and leave with no disruption, messages don't
+have to be processed in order, and has a very simple enqueue API (HTTP POST).
+
+The slowish tasks we have now are file ingest (wayback and/or SPNv2 +
+GROBID) and re-GROBID. In the near future will also have ML backlog to go
+through.
+
+Throughput isn't much of a concern as tasks take 10+ seconds each.
+
+
+## Specific Plan
+
+Continue publishing ingest requests to Kafka topic. Have a new persist worker
+consume from this topic and push to request table (but not result table) using
+`ON CONFLICT DO NOTHING`. Have a new single-process kafka consumer pull from
+the topic and push to NSQ. This consumer monitors NSQ and doesn't push too many
+requests (eg, 1k maximum). NSQ could potentially even run as in-memory mode.
+New worker/pusher class that acts as an NSQ client, possibly with parallelism.
+
+*Clean* NSQ shutdown/restart always persists data locally to disk.
+
+Unclean shutdown (eg, power failure) would mean NSQ might have lost state.
+Because we are persisting requests to sandcrawler-db, cleanup is simple:
+re-enqueue all requests from the past N days with null result or result older
+than M days.
+
+Still need multiple kafka and NSQ topics to have priority queues (eg, bulk,
+platform-specific).
+
+To start, have a single static NSQ host; don't need nsqlookupd. Could use
+wbgrp-svc506 (datanode VM with SSD, lots of CPU and RAM).
+
+To move hosts, simply restart the kafka pusher pointing at the new NSQ host.
+When the old host's queue is empty, restart the workers to consume from the new
+host, and destroy the old NSQ host.
+
+
+## Alternatives
+
+Workarounds I've done to date have been using the `grobid_tool.py` or
+`ingest_tool.py` JSON input modes to pipe JSON task files (millions of lines)
+through GNU/parallel. I guess GNU/parallel's distributed mode is also an option
+here.
+
+Other things that could be used:
+
+**celery**: popular, many features. need to run separate redis, no disk persistence (?)
+
+**disque**: need to run redis, no disk persistence (?) <https://github.com/antirez/disque>
+
+**gearman**: <http://gearman.org/> no disk persistence (?)
+
+
+## Old Notes
+
+TBD if would want to switch ingest requests from fatcat -> sandcrawler over,
+and have the continuous ingests run out of NSQ, or keep using kafka for that.
+currently can only do up to 10x parallelism or so with SPNv2, so that isn't a
+scaling pain point
diff --git a/proposals/20201012_no_capture.md b/proposals/20201012_no_capture.md
new file mode 100644
index 0000000..bb47ea2
--- /dev/null
+++ b/proposals/20201012_no_capture.md
@@ -0,0 +1,36 @@
+
+status: in-progress
+
+Storing no-capture missing URLs in `terminal_url`
+=================================================
+
+Currently, when the bulk-mode ingest code terminates with a `no-capture`
+status, the missing URL (which is not in GWB CDX) is not stored in
+sandcrawler-db. This proposed change is to include it in the existing
+`terminal_url` database column, with the `terminal_status_code` and
+`terminal_dt` columns empty.
+
+The implementation is rather simple:
+
+- CDX lookup code path should save the *actual* final missing URL (`next_url`
+ after redirects) in the result object's `terminal_url` field
+- ensure that this field gets passed through all the way to the database on the
+ `no-capture` code path
+
+This change does change the semantics of the `terminal_url` field somewhat, and
+could break existing assumptions, so it is being documented in this proposal
+document.
+
+
+## Alternatives
+
+The current status quo is to store the missing URL as the last element in the
+"hops" field of the JSON structure. We could keep this and have a convoluted
+pipeline that would read from the Kafka feed and extract them, but this would
+be messy. Eg, re-ingesting would not update the old kafka messages, so we would
+need some accounting of consumer group offsets after which missing URLs are
+truly missing.
+
+We could add a new `missing_url` database column and field to the JSON schema,
+for this specific use case. This seems like unnecessary extra work.
+
diff --git a/proposals/20201026_html_ingest.md b/proposals/20201026_html_ingest.md
new file mode 100644
index 0000000..785471b
--- /dev/null
+++ b/proposals/20201026_html_ingest.md
@@ -0,0 +1,129 @@
+
+status: deployed
+
+HTML Ingest Pipeline
+========================
+
+Basic goal: given an ingest request of type 'html', output an object (JSON)
+which could be imported into fatcat.
+
+Should work with things like (scholarly) blog posts, micropubs, registrations,
+protocols. Doesn't need to work with everything to start. "Platform" sites
+(like youtube, figshare, etc) will probably be a different ingest worker.
+
+A current unknown is what the expected size of this metadata is. Both in number
+of documents and amount of metadata per document.
+
+Example HTML articles to start testing:
+
+- complex distill article: <https://distill.pub/2020/bayesian-optimization/>
+- old HTML journal: <http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm>
+- NIH pub: <https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html>
+- first mondays (OJS): <https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729>
+- d-lib: <http://www.dlib.org/dlib/july17/williams/07williams.html>
+
+
+## Ingest Process
+
+Follow base URL to terminal document, which is assumed to be a status=200 HTML document.
+
+Verify that terminal document is fulltext. Extract both metadata and fulltext.
+
+Extract list of sub-resources. Filter out unwanted (eg favicon, analytics,
+unnecessary), apply a sanity limit. Convert to fully qualified URLs. For each
+sub-resource, fetch down to the terminal resource, and compute hashes/metadata.
+
+Open questions:
+
+- will probably want to parallelize sub-resource fetching. async?
+- behavior when failure fetching sub-resources
+
+
+## Ingest Result Schema
+
+JSON should be basically compatible with existing `ingest_file_result` objects,
+with some new sub-objects.
+
+Overall object (`IngestWebResult`):
+
+- `status`: str
+- `hit`: bool
+- `error_message`: optional, if an error
+- `hops`: optional, array of URLs
+- `cdx`: optional; single CDX row of primary HTML document
+- `terminal`: optional; same as ingest result
+ - `terminal_url`
+ - `terminal_dt`
+ - `terminal_status_code`
+ - `terminal_sha1hex`
+- `request`: optional but usually present; ingest request object, verbatim
+- `file_meta`: optional; file metadata about primary HTML document
+- `html_biblio`: optional; extracted biblio metadata from primary HTML document
+- `scope`: optional; detected/guessed scope (fulltext, etc)
+- `html_resources`: optional; array of sub-resources. primary HTML is not included
+- `html_body`: optional; just the status code and some metadata is passed through;
+ actual document would go through a different KafkaTopic
+ - `status`: str
+ - `agent`: str, eg "trafilatura/0.4"
+ - `tei_xml`: optional, str
+ - `word_count`: optional, int
+
+
+## New SQL Tables
+
+`html_meta`
+ sha1hex (primary key)
+ updated (of SQL row)
+ status
+ scope
+ has_teixml
+ has_thumbnail
+ word_count (from teixml fulltext)
+ biblio (JSON)
+ resources (JSON)
+
+Also writes to `ingest_file_result`, `file_meta`, and `cdx`, all only for the base HTML document.
+
+Note: needed to enable postgrest access to this table (for scholar worker).
+
+
+## Fatcat API Wants
+
+Would be nice to have lookup by SURT+timestamp, and/or by sha1hex of terminal base file.
+
+`hide` option for cdx rows; also for fileset equivalent.
+
+
+## New Workers
+
+Could reuse existing worker, have code branch depending on type of ingest.
+
+ingest file worker
+ => same as existing worker, because could be calling SPN
+
+persist result
+ => same as existing worker; adds persisting various HTML metadata
+
+persist html text
+ => talks to seaweedfs
+
+
+## New Kafka Topics
+
+HTML ingest result topic (webcapture-ish)
+
+sandcrawler-ENV.html-teixml
+ JSON wrapping TEI-XML (same as other fulltext topics)
+ key compaction and content compression enabled
+
+JSON schema:
+
+- `key` and `sha1hex`: str; used as kafka key
+- `status`: str
+- `tei_xml`: str, optional
+- `word_count`: int, optional
+
+## New S3/SeaweedFS Content
+
+`sandcrawler` bucket, `html` folder, `.tei.xml` suffix.
+
diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md
new file mode 100644
index 0000000..181cc11
--- /dev/null
+++ b/proposals/20201103_xml_ingest.md
@@ -0,0 +1,81 @@
+
+status: wip
+
+TODO:
+x XML fulltext URL extractor (based on HTML biblio metadata, not PDF url extractor)
+x differentiate JATS XML and scielo XML from generic XML?
+ application/xml+jats is what fatcat is doing for abstracts
+ but it should be application/jats+xml?
+ application/tei+xml
+ if startswith "<article " and "<article-meta>" => JATS
+x refactor ingest worker to be more general
+x have ingest code publish body to kafka topic
+x write a persist worker
+/ create/configure kafka topic
+- test everything locally
+- fatcat: ingest tool to create requests
+- fatcat: entity updates worker creates XML ingest requests for specific sources
+- fatcat: ingest file import worker allows XML results
+- ansible: deployment of persist worker
+
+XML Fulltext Ingest
+====================
+
+This document details changes to include XML fulltext ingest in the same way
+that we currently ingest PDF fulltext.
+
+Currently this will just fetch the single XML document, which is often lacking
+figures, tables, and other required files.
+
+## Text Encoding
+
+Because we would like to treat XML as a string in a couple contexts, but XML
+can have multiple encodings (indicated in an XML header), we are in a bit of a
+bind. Simply parsing into unicode and then re-encoding as UTF-8 could result in
+a header/content mismatch. Any form of re-encoding will change the hash of the
+document. For recording in fatcat, the file metadata will be passed through.
+For storing in Kafka and blob store (for downstream analysis), we will parse
+the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8
+encoding. The hash of the *original* XML file will be used as the key for
+referring to this document. This is unintuitive, but similar to what we are
+doing with PDF and HTML documents (extracting in a useful format, but keeping
+the original document's hash as a key).
+
+Unclear if we need to do this re-encode process for XML documents already in
+UTF-8 encoding.
+
+## Ingest Worker
+
+Could either re-use HTML metadata extractor to fetch XML fulltext links, or
+fork that code off to a separate method, like the PDF fulltext URL extractor.
+
+Hopefully can re-use almost all of the PDF pipeline code, by making that ingest
+worker class more generic and subclassing it.
+
+Result objects are treated the same as PDF ingest results: the result object
+has context about status, and if successful, file metadata and CDX row of the
+terminal object.
+
+TODO: should it be assumed that XML fulltext will end up in S3 bucket? or
+should there be an `xml_meta` SQL table tracking this, like we have for PDFs
+and HTML?
+
+TODO: should we detect and specify the XML schema better? Eg, indicate if JATS.
+
+
+## Persist Pipeline
+
+### Kafka Topic
+
+sandcrawler-ENV.xml-doc
+ similar to other fulltext topics; JSON wrapping the XML
+ key compaction, content compression
+
+### S3/SeaweedFS
+
+`sandcrawler` bucket, `xml` folder. Extension could depend on sub-type of XML?
+
+### Persist Worker
+
+New S3-only worker that pulls from kafka topic and pushes to S3. Works
+basically the same as PDF persist in S3-only mode, or like pdf-text worker.
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
new file mode 100644
index 0000000..793d6b5
--- /dev/null
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -0,0 +1,328 @@
+
+status: work-in-progress
+
+New PDF derivatives: thumbnails, metadata, raw text
+===================================================
+
+To support scholar.archive.org (fulltext search) and other downstream uses of
+fatcat, want to extract from many PDFs:
+
+- pdf structured metadata
+- thumbnail images
+- raw extracted text
+
+A single worker should extract all of these fields, and publish in to two kafka
+streams. Separate persist workers consume from the streams and push in to SQL
+and/or seaweedfs.
+
+Additionally, this extraction should happen automatically for newly-crawled
+PDFs as part of the ingest pipeline. When possible, checks should be run
+against the existing SQL table to avoid duplication of processing.
+
+
+## PDF Metadata and Text
+
+Kafka topic (name: `sandcrawler-ENV.pdf-text`; 12x partitions; gzip
+compression) JSON schema:
+
+ sha1hex (string; used as key)
+ status (string)
+ text (string)
+ page0_thumbnail (boolean)
+ meta_xml (string)
+ pdf_info (object)
+ pdf_extra (object)
+ word_count
+ file_meta (object)
+ source (object)
+
+For the SQL table we should have columns for metadata fields that are *always*
+saved, and put a subset of other interesting fields in a JSON blob. We don't
+need all metadata fields in SQL. Full metadata/info will always be available in
+Kafka, and we don't want SQL table size to explode. Schema:
+
+ CREATE TABLE IF NOT EXISTS pdf_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ has_page0_thumbnail BOOLEAN NOT NULL,
+ page_count INT CHECK (page_count >= 0),
+ word_count INT CHECK (word_count >= 0),
+ page0_height REAL CHECK (page0_height >= 0),
+ page0_width REAL CHECK (page0_width >= 0),
+ permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
+ pdf_created TIMESTAMP WITH TIME ZONE,
+ pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
+ metadata JSONB
+ -- maybe some analysis of available fields?
+ -- metadata JSON fields:
+ -- title
+ -- subject
+ -- author
+ -- creator
+ -- producer
+ -- CrossMarkDomains
+ -- doi
+ -- form
+ -- encrypted
+ );
+
+
+## Thumbnail Images
+
+Kafka Schema is raw image bytes as message body; sha1sum of PDF as the key. No
+compression, 12x partitions.
+
+Kafka topic name is `sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE` (eg,
+`sandcrawler-qa.pdf-thumbnail-180px-jpg`). Thus, topic name contains the
+"metadata" of thumbnail size/shape.
+
+Have decided to use JPEG thumbnails, 180px wide (and max 300px high, though
+width restriction is almost always the limiting factor). This size matches that
+used on archive.org, and is slightly larger than the thumbnails currently used
+on scholar.archive.org prototype. We intend to tweak the scholar.archive.org
+CSS to use the full/raw thumbnail image at max desktop size. At this size it
+would be difficult (though maybe not impossible?) to extract text (other than
+large-font titles).
+
+
+### Implementation
+
+We use the `poppler` CPP library (wrapper for python) to extract and convert everything.
+
+Some example usage of the `python-poppler` library:
+
+ import poppler
+ from PIL import Image
+
+ pdf = poppler.load_from_file("/home/bnewbold/10.1038@s41551-020-0534-9.pdf")
+ pdf.pdf_id
+ page = pdf.create_page(0)
+ page.page_rect().width
+
+ renderer = poppler.PageRenderer()
+ full_page = renderer.render_page(page)
+ img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA")
+ img.thumbnail((180,300), Image.BICUBIC)
+ img.save("something.jpg")
+
+## Deployment and Infrastructure
+
+Deployment will involve:
+
+- sandcrawler DB SQL table
+ => guesstimate size 100 GByte for hundreds of millions of PDFs
+- postgrest/SQL access to new table for internal HTTP API hits
+- seaweedfs raw text folder
+ => reuse existing bucket with GROBID XML; same access restrictions on content
+- seaweedfs thumbnail bucket
+ => new bucket for this world-public content
+- public nginx access to seaweed thumbnail bucket
+- extraction work queue kafka topic
+ => same schema/semantics as ungrobided
+- text/metadata kafka topic
+- thumbnail kafka topic
+- text/metadata persist worker(s)
+ => from kafka; metadata to SQL database; text to seaweedfs blob store
+- thumbnail persist worker
+ => from kafka to seaweedfs blob store
+- pdf extraction worker pool
+ => very similar to GROBID worker pool
+- ansible roles for all of the above
+
+Plan for processing/catchup is:
+
+- test with COVID-19 PDF corpus
+- run extraction on all current fatcat files available via IA
+- integrate with ingest pipeline for all new files
+- run a batch catchup job over all GROBID-parsed files with no pdf meta
+ extracted, on basis of SQL table query
+
+## Appendix: Thumbnail Size and Format Experimentation
+
+Using 190 PDFs from `/data/pdfs/random_crawl/files` on my laptop to test.
+
+TODO: actually, 4x images failed to convert with pdftocairo; this throws off
+"mean" sizes by a small amount.
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -png {} /tmp/test-png/{}.png
+ real 0m29.314s
+ user 0m26.794s
+ sys 0m2.484s
+ => missing: 4
+ => min: 0.8k
+ => max: 57K
+ => mean: 16.4K
+ => total: 3120K
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg {} /tmp/test-jpeg/{}.jpg
+ real 0m26.289s
+ user 0m24.022s
+ sys 0m2.490s
+ => missing: 4
+ => min: 1.2K
+ => max: 13K
+ => mean: 8.02k
+ => total: 1524K
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg -jpegopt optimize=y,quality=80 {} /tmp/test-jpeg2/{}.jpg
+ real 0m27.401s
+ user 0m24.941s
+ sys 0m2.519s
+ => missing: 4
+ => min: 577
+ => max: 14K
+ => mean:
+ => total: 1540K
+
+ time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-png/{}.png
+ => missing: 4
+ real 1m19.399s
+ user 1m17.150s
+ sys 0m6.322s
+ => min: 1.1K
+ => max: 325K
+ => mean:
+ => total: 8476K
+
+ time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-jpeg/{}.jpg
+ real 1m21.766s
+ user 1m17.040s
+ sys 0m7.155s
+ => total: 3484K
+
+NOTE: the following `pdf_thumbnail.py` images are somewhat smaller than the above
+jpg and pngs (max 180px wide, not 200px wide)
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-png/{}.png
+ real 0m48.198s
+ user 0m42.997s
+ sys 0m4.509s
+ => missing: 2; 2x additional stub images
+ => total: 5904K
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg/{}.jpg
+ real 0m45.252s
+ user 0m41.232s
+ sys 0m4.273s
+ => min: 1.4K
+ => max: 16K
+ => mean: ~9.3KByte
+ => total: 1772K
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg-360/{}.jpg
+ real 0m48.639s
+ user 0m44.121s
+ sys 0m4.568s
+ => mean: ~28k
+ => total: 5364K (3x of 180px batch)
+
+ quality=95
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-360/{}.jpg
+ real 0m49.407s
+ user 0m44.607s
+ sys 0m4.869s
+ => total: 9812K
+
+ quality=95
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-180/{}.jpg
+ real 0m45.901s
+ user 0m41.486s
+ sys 0m4.591s
+ => mean: 16.4K
+ => total: 3116K
+
+At the 180px size, the difference between default and quality=95 seems
+indistinguishable visually to me, but is more than a doubling of file size.
+Also tried at 300px and seems near-indistinguishable there as well.
+
+At a mean of 10 Kbytes per file:
+
+ 10 million -> 100 GBytes
+ 100 million -> 1 Tbyte
+
+Older COVID-19 thumbnails were about 400px wide:
+
+ pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1
+
+Display on scholar-qa.archive.org is about 135x181px
+
+archive.org does 180px wide
+
+Unclear if we should try to do double resolution for high DPI screens (eg,
+apple "retina").
+
+Using same size as archive.org probably makes the most sense: max 180px wide,
+preserve aspect ratio. And jpeg improvement seems worth it.
+
+#### Merlijn notes
+
+From work on optimizing microfilm thumbnail images:
+
+ When possible, generate a thumbnail that fits well on the screen of the
+ user. Always creating a large thumbnail will result in the browsers
+ downscaling them, leading to fuzzy text. If it’s not possible, then pick
+ the resolution you’d want to support (1.5x or 2x scaling) and
+ create thumbnails of that size, but also apply the other recommendations
+ below - especially a sharpening filter.
+
+ Use bicubic or lanczos interpolation. Bilinear and nearest neighbour are
+ not OK.
+
+ For text, consider applying a sharpening filter. Not a strong one, but some
+ sharpening can definitely help.
+
+
+## Appendix: PDF Info Fields
+
+From `pdfinfo` manpage:
+
+ The 'Info' dictionary contains the following values:
+
+ title
+ subject
+ keywords
+ author
+ creator
+ producer
+ creation date
+ modification date
+
+ In addition, the following information is printed:
+
+ tagged (yes/no)
+ form (AcroForm / XFA / none)
+ javascript (yes/no)
+ page count
+ encrypted flag (yes/no)
+ print and copy permissions (if encrypted)
+ page size
+ file size
+ linearized (yes/no)
+ PDF version
+ metadata (only if requested)
+
+For an example file, the output looks like:
+
+ Title: A mountable toilet system for personalized health monitoring via the analysis of excreta
+ Subject: Nature Biomedical Engineering, doi:10.1038/s41551-020-0534-9
+ Keywords:
+ Author: Seung-min Park
+ Creator: Springer
+ CreationDate: Thu Mar 26 01:26:57 2020 PDT
+ ModDate: Thu Mar 26 01:28:06 2020 PDT
+ Tagged: no
+ UserProperties: no
+ Suspects: no
+ Form: AcroForm
+ JavaScript: no
+ Pages: 14
+ Encrypted: no
+ Page size: 595.276 x 790.866 pts
+ Page rot: 0
+ File size: 6104749 bytes
+ Optimized: yes
+ PDF version: 1.4
+
+For context on the `pdf_id` fields ("original" and "updated"), read:
+<https://web.hypothes.is/blog/synchronizing-annotations-between-local-and-remote-pdfs/>
diff --git a/proposals/2020_seaweed_s3.md b/proposals/2020_seaweed_s3.md
new file mode 100644
index 0000000..5f4ff0b
--- /dev/null
+++ b/proposals/2020_seaweed_s3.md
@@ -0,0 +1,426 @@
+# Notes on seaweedfs
+
+> 2020-04-28, martin@archive.org
+
+Currently (04/2020) [minio](https://github.com/minio/minio) is used to store
+output from PDF analysis for [fatcat](https://fatcat.wiki) (e.g. from
+[grobid](https://grobid.readthedocs.io/en/latest/)). The file checksum (sha1)
+serves as key, values are blobs of XML or JSON.
+
+Problem: minio inserts slowed down after inserting 80M or more objects.
+
+Summary: I did four test runs, three failed, one (testrun-4) succeeded.
+
+* [testrun-4](https://git.archive.org/webgroup/sandcrawler/-/blob/master/proposals/2020_seaweed_s3.md#testrun-4)
+
+So far, in a non-distributed mode, the project looks usable. Added 200M objects
+(about 550G) in 6 days. Full CPU load, 400M RAM usage, constant insert times.
+
+----
+
+Details (03/2020) / @bnewbold, slack
+
+> the sandcrawler XML data store (currently on aitio) is grinding to a halt, I
+> think because despite tuning minio+ext4+hdd just doesn't work. current at 2.6
+> TiB of data (each document compressed with snappy) and 87,403,183 objects.
+
+> this doesn't impact ingest processing (because content is queued and archived
+> in kafka), but does impact processing and analysis
+
+> it is possible that the other load on aitio is making this worse, but I did
+> an experiment with dumping to a 16 TB disk that slowed way down after about
+> 50 million files also. some people on the internet said to just not worry
+> about these huge file counts on modern filesystems, but i've debugged a bit
+> and I think it is a bad idea after all
+
+Possible solutions
+
+* putting content in fake WARCs and trying to do something like CDX
+* deploy CEPH object store (or swift, or any other off-the-shelf object store)
+* try putting the files in postgres tables, mongodb, cassandra, etc: these are
+ not designed for hundreds of millions of ~50 KByte XML documents (5 - 500
+ KByte range)
+* try to find or adapt an open source tool like Haystack, Facebook's solution
+ to this engineering problem. eg:
+ https://engineering.linkedin.com/blog/2016/05/introducing-and-open-sourcing-ambry---linkedins-new-distributed-
+
+----
+
+The following are notes gathered during a few test runs of seaweedfs in 04/2020
+on wbgrp-svc170.us.archive.org (4 core E5-2620 v4, 4GB RAM).
+
+----
+
+## Setup
+
+There are frequent [releases](https://github.com/chrislusf/seaweedfs/releases)
+but for the test, we used a build off master branch.
+
+Directions for configuring AWS CLI for seaweedfs:
+[https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS).
+
+### Build the binary
+
+Using development version (requires a [Go installation](https://golang.org/dl/)).
+
+```
+$ git clone git@github.com:chrislusf/seaweedfs.git # 11f5a6d9
+$ cd seaweedfs
+$ make
+$ ls -lah weed/weed
+-rwxr-xr-x 1 tir tir 55M Apr 17 16:57 weed
+
+$ git rev-parse HEAD
+11f5a6d91346e5f3cbf3b46e0a660e231c5c2998
+
+$ sha1sum weed/weed
+a7f8f0b49e6183da06fc2d1411c7a0714a2cc96b
+```
+
+A single, 55M binary emerges after a few seconds. The binary contains
+subcommands to run different parts of seaweed, e.g. master or volume servers,
+filer and commands for maintenance tasks, like backup and compaction.
+
+To *deploy*, just copy this binary to the destination.
+
+### Quickstart with S3
+
+Assuming `weed` binary is in PATH.
+
+Start a master and volume server (over /tmp, most likely) and the S3 API with a single command:
+
+```
+$ weed server -s3
+...
+Start Seaweed Master 30GB 1.74 at 0.0.0.0:9333
+...
+Store started on dir: /tmp with 0 volumes max 7
+Store started on dir: /tmp with 0 ec shards
+Volume server start with seed master nodes: [localhost:9333]
+...
+Start Seaweed S3 API Server 30GB 1.74 at http port 8333
+...
+```
+
+Install the [AWS
+CLI](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS).
+Create a bucket.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 mb s3://sandcrawler-dev
+make_bucket: sandcrawler-dev
+```
+
+List buckets.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls
+2020-04-17 17:44:39 sandcrawler-dev
+```
+
+Create a dummy file.
+
+```
+$ echo "blob" > 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+Upload.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml s3://sandcrawler-dev
+upload: ./12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml to s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+List.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls s3://sandcrawler-dev
+2020-04-17 17:50:35 5 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+Stream to stdout.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml -
+blob
+```
+
+Drop the bucket.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 rm --recursive s3://sandcrawler-dev
+```
+
+### Builtin benchmark
+
+The project comes with a builtin benchmark command.
+
+```
+$ weed benchmark
+```
+
+I encountered an error like
+[#181](https://github.com/chrislusf/seaweedfs/issues/181), "no free volume
+left" - when trying to start the benchmark after the S3 ops. A restart or a restart with `-volume.max 100` helped.
+
+```
+$ weed server -s3 -volume.max 100
+```
+
+### Listing volumes
+
+```
+$ weed shell
+> volume.list
+Topology volume:15/112757 active:8 free:112742 remote:0 volumeSizeLimit:100 MB
+ DataCenter DefaultDataCenter volume:15/112757 active:8 free:112742 remote:0
+ Rack DefaultRack volume:15/112757 active:8 free:112742 remote:0
+ DataNode localhost:8080 volume:15/112757 active:8 free:112742 remote:0
+ volume id:1 size:105328040 collection:"test" file_count:33933 version:3 modified_at_second:1587215730
+ volume id:2 size:106268552 collection:"test" file_count:34236 version:3 modified_at_second:1587215730
+ volume id:3 size:106290280 collection:"test" file_count:34243 version:3 modified_at_second:1587215730
+ volume id:4 size:105815368 collection:"test" file_count:34090 version:3 modified_at_second:1587215730
+ volume id:5 size:105660168 collection:"test" file_count:34040 version:3 modified_at_second:1587215730
+ volume id:6 size:106296488 collection:"test" file_count:34245 version:3 modified_at_second:1587215730
+ volume id:7 size:105753288 collection:"test" file_count:34070 version:3 modified_at_second:1587215730
+ volume id:8 size:7746408 file_count:12 version:3 modified_at_second:1587215764
+ volume id:9 size:10438760 collection:"test" file_count:3363 version:3 modified_at_second:1587215788
+ volume id:10 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788
+ volume id:11 size:10258728 collection:"test" file_count:3305 version:3 modified_at_second:1587215788
+ volume id:12 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788
+ volume id:13 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788
+ volume id:14 size:10190440 collection:"test" file_count:3283 version:3 modified_at_second:1587215788
+ volume id:15 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788
+ DataNode localhost:8080 total size:820752408 file_count:261934
+ Rack DefaultRack total size:820752408 file_count:261934
+ DataCenter DefaultDataCenter total size:820752408 file_count:261934
+total size:820752408 file_count:261934
+```
+
+### Custom S3 benchmark
+
+To simulate the use case of S3 for 100-500M small files (grobid xml, pdftotext,
+...), I created a synthetic benchmark.
+
+* [https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b)
+
+We just try to fill up the datastore with millions of 5k blobs.
+
+----
+
+### testrun-1
+
+Small set, just to run. Status: done. Learned that the default in-memory volume
+index grows too quickly for the 4GB RAM machine.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-1 -s3 -volume.max 512 -master.volumeSizeLimitMB 100
+```
+
+* https://github.com/chrislusf/seaweedfs/issues/498 -- RAM
+* at 10M files, we already consume ~1G
+
+```
+-volume.index string
+ Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance. (default "memory")
+```
+
+### testrun-2
+
+200M 5k objects, in-memory volume index. Status: done. Observed: After 18M
+objects the 512 100MB volumes are exhausted and seaweedfs will not accept any
+new data.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-2 -s3 -volume.max 512 -master.volumeSizeLimitMB 100
+...
+I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_511.idx to memory
+I0418 12:01:43 1622 store.go:122] add volume 511
+I0418 12:01:43 1622 volume_layout.go:243] Volume 511 becomes writable
+I0418 12:01:43 1622 volume_growth.go:224] Created Volume 511 on topo:DefaultDataCenter:DefaultRack:localhost:8080
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:511
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:511
+I0418 12:01:43 1622 store.go:118] In dir /tmp/martin-seaweedfs-testrun-2 adds volume:512 collection:test replicaPlacement:000 ttl:
+I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_512.idx to memory
+I0418 12:01:43 1622 store.go:122] add volume 512
+I0418 12:01:43 1622 volume_layout.go:243] Volume 512 becomes writable
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:512
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:512
+I0418 12:01:43 1622 volume_growth.go:224] Created Volume 512 on topo:DefaultDataCenter:DefaultRack:localhost:8080
+I0418 12:01:43 1622 node.go:82] topo failed to pick 1 from 0 node candidates
+I0418 12:01:43 1622 volume_growth.go:88] create 7 volume, created 2: No enough data node found!
+I0418 12:04:30 1622 volume_layout.go:231] Volume 511 becomes unwritable
+I0418 12:04:30 1622 volume_layout.go:231] Volume 512 becomes unwritable
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+I0418 12:04:30 1622 filer_server_handlers_write.go:120] fail to allocate volume for /buckets/test/k43731970, collection:test, datacenter:
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+I0418 12:04:30 1622 masterclient.go:88] filer failed to receive from localhost:9333: rpc error: code = Unavailable desc = transport is closing
+I0418 12:04:30 1622 master_grpc_server.go:276] - client filer@::1:18888
+```
+
+Inserted about 18M docs, then:
+
+```
+worker-0 @3720000 45475.13 81.80
+worker-1 @3730000 45525.00 81.93
+worker-3 @3720000 45525.76 81.71
+worker-4 @3720000 45527.22 81.71
+Process Process-1:
+Traceback (most recent call last):
+ File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
+ self.run()
+ File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
+ self._target(*self._args, **self._kwargs)
+ File "s3test.py", line 42, in insert_keys
+ s3.Bucket(bucket).put_object(Key=key, Body=data)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/factory.py", line 520, in do_action
+ response = action(self, *args, **kwargs)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/action.py", line 83, in __call__
+ response = getattr(parent.meta.client, operation_name)(**params)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 316, in _api_call
+ return self._make_api_call(operation_name, kwargs)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 626, in _make_api_call
+ raise error_class(parsed_response, operation_name)
+botocore.exceptions.ClientError: An error occurred (InternalError) when calling the PutObject operation (reached max retries: 4): We encountered an internal error, please try again.
+
+real 759m30.034s
+user 1962m47.487s
+sys 105m21.113s
+```
+
+Sustained 400 S3 puts/s, RAM usage 41% of a 4G machine. 56G on disk.
+
+> No free volumes left! Failed to allocate bucket for /buckets/test/k163721819
+
+### testrun-3
+
+* use leveldb, leveldbLarge
+* try "auto" volumes
+* Status: done. Observed: rapid memory usage increase.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-3 -s3 -volume.max 0 -volume.index=leveldbLarge -filer=false -master.volumeSizeLimitMB 100
+```
+
+Observations: memory usage grows rapidly, soon at 15%.
+
+Note-to-self: [https://github.com/chrislusf/seaweedfs/wiki/Optimization](https://github.com/chrislusf/seaweedfs/wiki/Optimization)
+
+### testrun-4
+
+The default volume size is 30G (and cannot be more at the moment), and RAM
+grows very much with the number of volumes. Therefore, keep default volume size
+and do not limit number of volumes `-volume.max 0` and do not use in-memory
+index (rather leveldb)
+
+Status: done, 200M object upload via Python script successfully in about 6 days,
+memory usage was at a moderate 400M (~10% of RAM). Relatively constant
+performance at about 400 `PutObject` requests/s (over 5 threads, each thread
+was around 80 requests/s; then testing with 4 threads, each thread got to
+around 100 requests/s).
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-4 -s3 -volume.max 0 -volume.index=leveldb
+```
+
+The test script command was (40M files per worker, 5 workers).
+
+```
+$ time python s3test.py -n 40000000 -w 5 2> s3test.4.log
+...
+
+real 8454m33.695s
+user 21318m23.094s
+sys 1128m32.293s
+```
+
+The test script adds keys from `k0...k199999999`.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls s3://test | head -20
+2020-04-19 09:27:13 5000 k0
+2020-04-19 09:27:13 5000 k1
+2020-04-19 09:27:13 5000 k10
+2020-04-19 09:27:15 5000 k100
+2020-04-19 09:27:26 5000 k1000
+2020-04-19 09:29:15 5000 k10000
+2020-04-19 09:47:49 5000 k100000
+2020-04-19 12:54:03 5000 k1000000
+2020-04-20 20:14:10 5000 k10000000
+2020-04-22 07:33:46 5000 k100000000
+2020-04-22 07:33:46 5000 k100000001
+2020-04-22 07:33:46 5000 k100000002
+2020-04-22 07:33:46 5000 k100000003
+2020-04-22 07:33:46 5000 k100000004
+2020-04-22 07:33:46 5000 k100000005
+2020-04-22 07:33:46 5000 k100000006
+2020-04-22 07:33:46 5000 k100000007
+2020-04-22 07:33:46 5000 k100000008
+2020-04-22 07:33:46 5000 k100000009
+2020-04-20 20:14:10 5000 k10000001
+```
+
+Glance at stats.
+
+```
+$ du -hs /tmp/martin-seaweedfs-testrun-4
+596G /tmp/martin-seaweedfs-testrun-4
+
+$ find . /tmp/martin-seaweedfs-testrun-4 | wc -l
+5104
+
+$ ps --pid $(pidof weed) -o pid,tid,class,stat,vsz,rss,comm
+ PID TID CLS STAT VSZ RSS COMMAND
+32194 32194 TS Sl+ 1966964 491644 weed
+
+$ ls -1 /proc/$(pidof weed)/fd | wc -l
+192
+
+$ free -m
+ total used free shared buff/cache available
+Mem: 3944 534 324 39 3086 3423
+Swap: 4094 27 4067
+```
+
+### Note on restart
+
+When stopping (CTRL-C) and restarting `weed` it will take about 10 seconds to
+get the S3 API server back up, but another minute or two, until seaweedfs
+inspects all existing volumes and indices.
+
+In that gap, requests to S3 will look like internal server errors.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp s3://test/k100 -
+download failed: s3://test/k100 to - An error occurred (500) when calling the
+GetObject operation (reached max retries: 4): Internal Server Error
+```
+
+### Read benchmark
+
+Reading via command line `aws` client is a bit slow at first sight (3-5s).
+
+```
+$ time aws --endpoint-url http://localhost:8333 s3 cp s3://test/k123456789 -
+ppbhjgzkrrgwagmjsuwhqcwqzmefybeopqz [...]
+
+real 0m5.839s
+user 0m0.898s
+sys 0m0.293s
+```
+
+#### Single process random reads
+
+* via [s3read.go](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b#file-s3read-go)
+
+Running 1000 random reads takes 49s.
+
+#### Concurrent random reads
+
+* 80000 requests with 8 parallel processes: 7m41.973968488s (so about 170 objects/s)
+* seen up to 760 keys/s reads for 8 workers
+* weed will utilize all cores, so more cpus could result in higher read throughput
+* RAM usage can increase (seen up to 20% of 4G RAM), then decrease (GC) back to 5%, depending on query load
diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md
new file mode 100644
index 0000000..bead7a4
--- /dev/null
+++ b/proposals/2021-04-22_crossref_db.md
@@ -0,0 +1,86 @@
+
+status: work-in-progress
+
+Crossref DOI Metadata in Sandcrawler DB
+=======================================
+
+Proposal is to have a local copy of Crossref API metadata records in
+sandcrawler DB, accessible by simple key lookup via postgrest.
+
+Initial goal is to include these in scholar work "bundles" (along with
+fulltext, etc), in particular as part of reference extraction pipeline. Around
+late 2020, many additional references became available via Crossref records,
+and have not been imported (updated) into fatcat. Reference storage in fatcat
+API is a scaling problem we would like to put off, so injecting content in this
+way is desirable.
+
+To start, working with a bulk dump made available by Crossref. In the future,
+might persist the daily feed so that we have a continuously up-to-date copy.
+
+Another application of Crossref-in-bundles is to identify overall scale of
+changes since initial Crossref metadata import.
+
+
+## Sandcrawler DB Schema
+
+The "indexed" field in this case refers to the upstream timestamp, not the
+sandcrawler database update time.
+
+ CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+ );
+
+For postgrest access, may need to also:
+
+ GRANT SELECT ON public.crossref TO web_anon;
+
+## SQL Backfill Command
+
+For an example file:
+
+ cat sample.json \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+For a full snapshot:
+
+ zcat crossref_public_data_file_2021_01.json.gz \
+ | pv -l \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+jq is the bottleneck (100% of a single CPU core).
+
+## Kafka Worker
+
+Pulls from the fatcat crossref ingest Kafka feed and persists into the crossref
+table.
+
+## SQL Table Disk Utilization
+
+An example backfill from early 2021, with about 120 million Crossref DOI
+records.
+
+Starting database size (with ingest running):
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 896G 818G 53% /1
+
+ Size: 475.14G
+
+Ingest SQL command took:
+
+ 120M 15:06:08 [2.22k/s]
+ COPY 120684688
+
+After database size:
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 1.2T 498G 71% /1
+
+ Size: 794.88G
+
+So about 320 GByte of disk.
diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@
+
+File Ingest Mode: 'component'
+=============================
+
+A new ingest type for downloading individual files which are a subset of a
+complete work.
+
+Some publishers now assign DOIs to individual figures, supplements, and other
+"components" of an overall release or document.
+
+Initial mimetypes to allow:
+
+- image/jpeg
+- image/tiff
+- image/png
+- image/gif
+- audio/mpeg
+- video/mp4
+- video/mpeg
+- text/plain
+- text/csv
+- application/json
+- application/xml
+- application/pdf
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-rar
+- application/x-7z-compressed
+- application/x-tar
+- application/vnd.ms-powerpoint
+- application/vnd.ms-excel
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+Intentionally not supporting:
+
+- text/html
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'component' ingest.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'component' crawls.
+
+
+## Examples
+
+Hundreds of thousands: <https://fatcat.wiki/release/search?q=type%3Acomponent+in_ia%3Afalse>
+
+#### ACS Supplement File
+
+<https://doi.org/10.1021/acscatal.0c02627.s002>
+
+Redirects directly to .zip in browser. SPN is blocked by cookie check.
+
+#### Frontiers .docx Supplement
+
+<https://doi.org/10.3389/fpls.2019.01642.s001>
+
+Redirects to full article page. There is a pop-up for figshare, seems hard to process.
+
+#### Figshare Single File
+
+<https://doi.org/10.6084/m9.figshare.13646972.v1>
+
+As 'component' type in fatcat.
+
+Redirects to a landing page. Dataset ingest seems more appropriate for this entire domain.
+
+#### PeerJ supplement file
+
+<https://doi.org/10.7717/peerj.10257/supp-7>
+
+PeerJ is hard because it redirects to a single HTML page, which has links to
+supplements in the HTML. Perhaps a custom extractor will work.
+
+#### eLife
+
+<https://doi.org/10.7554/elife.38407.010>
+
+The current crawl mechanism makes it seemingly impossible to extract a specific
+supplement from the document as a whole.
+
+#### Zookeys
+
+<https://doi.org/10.3897/zookeys.895.38576.figure53>
+
+These are extract-able.
+
+#### OECD PDF Supplement
+
+<https://doi.org/10.1787/f08c6324-en>
+<https://www.oecd-ilibrary.org/trade/imports-of-services-billions-of-us-dollars_f08c6324-en>
+
+Has an Excel (.xls) link, great, but then paywall.
+
+#### Direct File Link
+
+<https://doi.org/10.1787/888934207500>
+
+This one is also OECD, but is a simple direct download.
+
+#### Protein Data Bank (PDB) Entry
+
+<https://doi.org/10.2210/pdb6ls2/pdb>
+
+Multiple files; dataset/fileset more appropriate for these.
diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md
new file mode 100644
index 0000000..470827a
--- /dev/null
+++ b/proposals/2021-09-13_src_ingest.md
@@ -0,0 +1,53 @@
+
+File Ingest Mode: 'src'
+=======================
+
+Ingest type for "source" of works in document form. For example, tarballs of
+LaTeX source and figures, as published on arxiv.org and Pubmed Central.
+
+For now, presumption is that this would be a single file (`file` entity in
+fatcat).
+
+Initial mimetypes to allow:
+
+- text/x-tex
+- application/xml
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-tar
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'src' ingest.
+
+Might keep ingest disabled on the fatcat side, at least initially. Eg, until
+there is some notion of "file scope", or other ways of treating 'src' tarballs
+separate from PDFs or other fulltext formats.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'src' crawls.
+
+
+## Examples
+
+ arxiv:2109.00954v1
+ fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy
+ https://arxiv.org/format/2109.00954v1
+ https://arxiv.org/e-print/2109.00954v1
+
+ arxiv:1912.03397v2
+ https://arxiv.org/format/1912.03397v2
+ https://arxiv.org/e-print/1912.03397v2
+ NOT: https://arxiv.org/pdf/1912.03397v2
+
+ pmcid:PMC3767916
+ https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz
+
+For PMC, will need to use one of the .csv file lists to get the digit prefixes.
diff --git a/proposals/schema_changes.sql b/proposals/schema_changes.sql
new file mode 100644
index 0000000..e18d051
--- /dev/null
+++ b/proposals/schema_changes.sql
@@ -0,0 +1,40 @@
+
+-- file_meta: proposed change is to declare more columns NOT NULL
+CREATE TABLE IF NOT EXISTS file_meta (
+ sha1hex TEXT NOT NULL PRIMARY KEY CHECK (octet_length(sha1hex) = 40), -- primary key; only the length (40 octets) is checked
+ sha256hex TEXT NOT NULL CHECK (octet_length(sha256hex) = 64), -- only the length (64 octets) is checked
+ md5hex TEXT NOT NULL CHECK (octet_length(md5hex) = 32), -- only the length (32 octets) is checked
+ size_bytes BIGINT NOT NULL,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1) -- nullable; if present must be non-empty
+);
+
+-- CDX: open question -- add `domain` and `host` columns (declared NOT NULL below)?
+CREATE TABLE IF NOT EXISTS cdx (
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), -- stored as text, exactly 14 octets (presumably YYYYMMDDhhmmss -- confirm)
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), -- nullable; same 40-octet length check as sha1hex
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_csize BIGINT,
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, -- defaults to the insertion time
+ domain TEXT NOT NULL CHECK (octet_length(domain) >= 1),
+ host TEXT NOT NULL CHECK (octet_length(host) >= 1),
+ PRIMARY KEY(url, datetime) -- one row per (url, capture datetime) pair
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+
+-- shadow: direct fast import with just md5hex; big UPDATE via join with file_meta
+CREATE TABLE IF NOT EXISTS shadow (
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+ sha1hex TEXT CHECK (octet_length(sha1hex) = 40), -- nullable; presumably filled in by the later UPDATE (confirm)
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(shadow_corpus, shadow_id) -- one row per identifier within a corpus
+);
+CREATE INDEX IF NOT EXISTS shadow_sha1hex_idx ON shadow(sha1hex); -- IF NOT EXISTS: safe to re-run, like the CREATE TABLE above
diff --git a/python/.coveragerc b/python/.coveragerc
new file mode 100644
index 0000000..67053a7
--- /dev/null
+++ b/python/.coveragerc
@@ -0,0 +1,5 @@
+[run]
+omit = tests/*
+source =
+ sandcrawler
+ grobid2json
diff --git a/python/.flake8 b/python/.flake8
new file mode 100644
index 0000000..6aa37b6
--- /dev/null
+++ b/python/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+select = C,E,F,W,ANN
+ignore = F405,F403,W503,E231,E203,E501,E226,E711,E713,ANN101,ANN204,ANN102
+max-complexity = 20
+exclude = .git,__pycache__,.venv
+max-line-length = 120
diff --git a/python/.gitignore b/python/.gitignore
new file mode 100644
index 0000000..0d6b987
--- /dev/null
+++ b/python/.gitignore
@@ -0,0 +1,11 @@
+*part-000*
+*.tar.gz
+*.gz
+htmlcov/
+samples/
+
+!.flake8
+!.gitlab-ci.yml
+!.pylintrc
+!.coveragerc
+!.gitignore
diff --git a/mapreduce/.pylintrc b/python/.pylintrc
index 78e9e7f..387bca1 100644
--- a/mapreduce/.pylintrc
+++ b/python/.pylintrc
@@ -1,5 +1,6 @@
[MESSAGES CONTROL]
-disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck
+# TODO: should re-enable some of these
+disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck,unused-wildcard-import,no-member,cyclic-import,too-few-public-methods,wildcard-import,too-many-locals,too-many-ancestors,unused-import
[REPORTS]
output-format=colorized
@@ -8,3 +9,6 @@ include-ids=yes
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,DELETEME
+
+[TYPECHECK]
+extension-pkg-whitelist=selectolax,pydantic,responses
diff --git a/python/Makefile b/python/Makefile
new file mode 100644
index 0000000..4ef99f5
--- /dev/null
+++ b/python/Makefile
@@ -0,0 +1,32 @@
+
+SHELL = /bin/bash
+.SHELLFLAGS = -o pipefail -c
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: deps
+deps: ## Install dependencies using pipenv
+ pipenv install --dev
+
+.PHONY: lint
+lint: ## Run lints (eg, flake8, mypy)
+ #pipenv run flake8 . --exit-zero
+ pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
+ pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
+ pipenv run pylint --rcfile=.pylintrc -E --jobs=4 sandcrawler tests/*.py *.py
+
+.PHONY: fmt
+fmt: ## Run code formatting on all source code
+	pipenv run black *.py sandcrawler/ tests/
+
+.PHONY: test
+test: ## Run all tests and lints
+ pipenv run pytest
+
+.PHONY: coverage
+coverage: ## Run all tests with coverage
+ pipenv run pytest --cov --cov-report=term --cov-report=html
diff --git a/python/Pipfile b/python/Pipfile
new file mode 100644
index 0000000..d5797a5
--- /dev/null
+++ b/python/Pipfile
@@ -0,0 +1,65 @@
+[[source]]
+name = "ia"
+url = "https://devpi.archive.org/wb/prod"
+verify_ssl = true
+
+[[source]]
+name = "pypi"
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = ">=4"
+pytest-pythonpath = "*"
+pytest-pylint = "*"
+responses = ">=0.10"
+pytest-cov = "*"
+pytest-mock = "*"
+pylint = "*"
+ipython = "*"
+mypy = "*"
+flake8 = "*"
+flake8-annotations = "*"
+
+# pytype is failing to install on xenial VMs
+#pytype = "*"
+
+[packages]
+requests = ">=2"
+raven = {extras = ['flask'],version = "*"}
+confluent-kafka = "*"
+python-snappy = "*"
+boto3 = "*"
+minio = "<7.0.0"
+psycopg2 = "*"
+bs4 = "*"
+python-magic = "*"
+ftfy = "*"
+internetarchive = "*"
+Flask = ">=1"
+urlcanon = "*"
+pillow = ">=3"
+python-poppler = ">=0.2.1"
+selectolax = ">=0.2"
+trafilatura = "*"
+pydantic = ">=1.7"
+dateparser = "*"
+braveblock = "*"
+dynaconf = ">=3"
+sentry-sdk = { version = ">=0.14.0", extras = [] }
+zstandard = "*"
+
+# must lock black to an exact version because it is still "beta"
+# see: https://github.com/psf/black/issues/517
+black = "==19.10b0"
+
+[requires]
+python_version = "3.8"
+
+[packages.globalwayback]
+version = ">=0.6.5"
+index = "ia"
+
+[packages.wayback]
+version = ">=0.6.3"
+index = "ia"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
new file mode 100644
index 0000000..a2d675c
--- /dev/null
+++ b/python/Pipfile.lock
@@ -0,0 +1,1515 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "361eab78e7de1efee28a2888133eef7600c80f0f15ad3bfc9e689565979c34a0"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.8"
+ },
+ "sources": [
+ {
+ "name": "ia",
+ "url": "https://devpi.archive.org/wb/prod",
+ "verify_ssl": true
+ },
+ {
+ "name": "pypi",
+ "url": "https://pypi.python.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "appdirs": {
+ "hashes": [
+ "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
+ "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"
+ ],
+ "version": "==1.4.4"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
+ "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
+ ],
+ "version": "==20.3.0"
+ },
+ "backports.csv": {
+ "hashes": [
+ "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
+ "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
+ ],
+ "version": "==1.0.7"
+ },
+ "beautifulsoup4": {
+ "hashes": [
+ "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
+ "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
+ "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
+ ],
+ "version": "==4.9.3"
+ },
+ "black": {
+ "hashes": [
+ "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b",
+ "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"
+ ],
+ "index": "ia",
+ "version": "==19.10b0"
+ },
+ "blinker": {
+ "hashes": [
+ "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
+ ],
+ "version": "==1.4"
+ },
+ "boto3": {
+ "hashes": [
+ "sha256:b5052144034e490358c659d0e480c17a4e604fd3aee9a97ddfe6e361a245a4a5",
+ "sha256:efd6c96c98900e9fbf217f13cb58f59b793e51f69a1ce61817eefd31f17c6ef5"
+ ],
+ "index": "ia",
+ "version": "==1.16.55"
+ },
+ "botocore": {
+ "hashes": [
+ "sha256:760d0c16c1474c2a46e3fa45e33ae7457b5cab7410737ab1692340ade764cc73",
+ "sha256:b34327d84b3bb5620fb54603677a9a973b167290c2c1e7ab69c4a46b201c6d46"
+ ],
+ "version": "==1.19.55"
+ },
+ "braveblock": {
+ "hashes": [
+ "sha256:016d7874104e68019e47e5ff57f7ccc1529f0d3bd47756be41d963e354fc5840",
+ "sha256:06a77976a4e6132d5809a29ad8469da6463514495dd7f5122e04f35a363fd2db",
+ "sha256:077098ca6cb904ee863a5001adf49947cdf08d017c52e4e91ce9ebada24ef8e5",
+ "sha256:725665b61627b4cd3a4a1b37a9e3fb5d2f2c3e59413a5b17db0836e1eca08b04",
+ "sha256:82fb24db4e94a5f0536d44864f451fe1c42678b71394c26fa812f73671f17570",
+ "sha256:9fe746a0738e9e5da0347dc2703bb39b5b20d005e6e9d2c3e0c57a7a1eb7dafa",
+ "sha256:b636faf7388236fb90ce7938e6a64c7e1f76de25f27e57289b0b3788cf1ecbc6",
+ "sha256:da7d44e910b18ed83a34d50842fbaeaa6f3000dee329e79e4889987af4f38fb0",
+ "sha256:e4cb86afe64ffa0dc44dc2a7aeca2b7af7a250c56b1b08c32d7966c06c3e6498"
+ ],
+ "index": "ia",
+ "version": "==0.1.10"
+ },
+ "brotli": {
+ "hashes": [
+ "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
+ "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
+ "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
+ "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
+ "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
+ "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
+ "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
+ "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
+ "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
+ "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
+ "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
+ "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
+ "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
+ "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
+ "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
+ ],
+ "version": "==1.0.9"
+ },
+ "bs4": {
+ "hashes": [
+ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+ ],
+ "index": "ia",
+ "version": "==0.0.1"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
+ "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ ],
+ "version": "==2020.12.5"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
+ "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.0"
+ },
+ "click": {
+ "hashes": [
+ "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
+ "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
+ ],
+ "version": "==7.1.2"
+ },
+ "configparser": {
+ "hashes": [
+ "sha256:005c3b102c96f4be9b8f40dafbd4997db003d07d1caa19f37808be8031475f2a",
+ "sha256:08e8a59ef1817ac4ed810bb8e17d049566dd6e024e7566f6285c756db2bb4ff8"
+ ],
+ "version": "==5.0.1"
+ },
+ "confluent-kafka": {
+ "hashes": [
+ "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0",
+ "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3",
+ "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2",
+ "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696",
+ "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3",
+ "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8",
+ "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080",
+ "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221",
+ "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66",
+ "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003",
+ "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe",
+ "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4",
+ "sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9",
+ "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120",
+ "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410",
+ "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c",
+ "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35",
+ "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2",
+ "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf",
+ "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38",
+ "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44",
+ "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53",
+ "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592",
+ "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4",
+ "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e",
+ "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b"
+ ],
+ "index": "ia",
+ "version": "==1.5.0"
+ },
+ "contextlib2": {
+ "hashes": [
+ "sha256:01f490098c18b19d2bd5bb5dc445b2054d2fa97f09a4280ba2c5f3c394c8162e",
+ "sha256:3355078a159fbb44ee60ea80abd0d87b80b78c248643b49aa6d94673b413609b"
+ ],
+ "version": "==0.6.0.post1"
+ },
+ "courlan": {
+ "hashes": [
+ "sha256:785426268d3f8cc88089cdac287b70ac64c03ec3ce227bb6fc4d41b2c749b4b9",
+ "sha256:8267058da4b851dcd159d9f49e11c44cb3351b25326708cf405d7ebd0339fea3"
+ ],
+ "version": "==0.3.0"
+ },
+ "crawllib": {
+ "hashes": [
+ "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
+ ],
+ "version": "==0.1.4.8"
+ },
+ "cssselect": {
+ "hashes": [
+ "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
+ "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
+ ],
+ "version": "==1.1.0"
+ },
+ "dateparser": {
+ "hashes": [
+ "sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a",
+ "sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8"
+ ],
+ "index": "ia",
+ "version": "==1.0.0"
+ },
+ "dawg": {
+ "hashes": [
+ "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753",
+ "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce",
+ "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e",
+ "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f",
+ "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac",
+ "sha256:a5a0ae005de5095d53139895d71d09d78a613f8884583a34725b177fd53ada29",
+ "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e",
+ "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b",
+ "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36",
+ "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6"
+ ],
+ "version": "==0.8.0"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ ],
+ "version": "==4.4.2"
+ },
+ "docopt": {
+ "hashes": [
+ "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"
+ ],
+ "version": "==0.6.2"
+ },
+ "dogpile.cache": {
+ "hashes": [
+ "sha256:bc9dde1ffa5de0179efbcdc73773ef0553921130ad01955422f2932be35c059e"
+ ],
+ "version": "==0.9.2"
+ },
+ "dynaconf": {
+ "hashes": [
+ "sha256:808adfe964f10695846dbf8dad7632e47fc3bc38860fd1887ed57dddffc4eff2",
+ "sha256:9b34ab2f811a81755f5eb4beac77a69e1e0887528c7e37fc4bc83fed52dcf502"
+ ],
+ "index": "ia",
+ "version": "==3.1.2"
+ },
+ "elasticsearch": {
+ "hashes": [
+ "sha256:4ebd34fd223b31c99d9f3b6b6236d3ac18b3046191a37231e8235b06ae7db955",
+ "sha256:a725dd923d349ca0652cf95d6ce23d952e2153740cf4ab6daf4a2d804feeed48"
+ ],
+ "version": "==7.10.1"
+ },
+ "filelock": {
+ "hashes": [
+ "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59",
+ "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"
+ ],
+ "version": "==3.0.12"
+ },
+ "flask": {
+ "hashes": [
+ "sha256:4efa1ae2d7c9865af48986de8aeb8504bf32c7f3d6fdc9353d34b21f4b127060",
+ "sha256:8a4fdd8936eba2512e9c85df320a37e694c93945b33ef33c89946a340a238557"
+ ],
+ "index": "ia",
+ "version": "==1.1.2"
+ },
+ "ftfy": {
+ "hashes": [
+ "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720"
+ ],
+ "index": "ia",
+ "version": "==5.8"
+ },
+ "globalwayback": {
+ "hashes": [
+ "sha256:53a8d0ffbe417dba94cf87dfe080b6ecc7961cc0658ad30ee684e4ccb0c9e7ed"
+ ],
+ "index": "ia",
+ "version": "==0.6.9"
+ },
+ "htmldate": {
+ "hashes": [
+ "sha256:602b778042c69459f1a5515e619172880da7837a8c8eb1034bf7eb4e837a6c0b",
+ "sha256:b500fcea9dd8310de083490c88c0cb47424a2aa14fcd42ebd1493e95d8102da7"
+ ],
+ "version": "==0.7.3"
+ },
+ "ialib": {
+ "hashes": [
+ "sha256:0b1745e512266fd6c91af68763f2f8427eec6c92c5009fc75c50d9352fc764fc"
+ ],
+ "version": "==0.5.1.1"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "internetarchive": {
+ "hashes": [
+ "sha256:0e9b24577086283280a5089b3e65086640b4e42d61ca4af913639f45b02b9e4c",
+ "sha256:bf28ab57939a80a61c2cf66bb7173ea1989013494dab564c99035574d5b4faea"
+ ],
+ "index": "ia",
+ "version": "==1.9.6"
+ },
+ "itsdangerous": {
+ "hashes": [
+ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
+ "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
+ ],
+ "version": "==1.1.0"
+ },
+ "jinja2": {
+ "hashes": [
+ "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
+ "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
+ ],
+ "version": "==2.11.2"
+ },
+ "jmespath": {
+ "hashes": [
+ "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
+ "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
+ ],
+ "version": "==0.10.0"
+ },
+ "jsonpatch": {
+ "hashes": [
+ "sha256:da3831be60919e8c98564acfc1fa918cb96e7c9750b0428388483f04d0d1c5a7",
+ "sha256:e930adc932e4d36087dbbf0f22e1ded32185dfb20662f2e3dd848677a5295a14"
+ ],
+ "version": "==1.28"
+ },
+ "jsonpointer": {
+ "hashes": [
+ "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
+ "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
+ ],
+ "version": "==2.0"
+ },
+ "justext": {
+ "hashes": [
+ "sha256:330035dfaaa960465276afa1836dfb6e63791011a8dfc6da2757142cc4d14d54",
+ "sha256:4b8b7f0749e8725f0089ebe0239c1a45286d61bf507b3f05d136c2700dea4aa6"
+ ],
+ "version": "==2.2.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
+ "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
+ "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
+ "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
+ "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
+ "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
+ "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
+ "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
+ "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
+ "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
+ "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
+ "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
+ "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
+ "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
+ "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
+ "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
+ "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
+ "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
+ "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
+ "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
+ "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
+ "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
+ "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
+ "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
+ "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
+ "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
+ "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
+ "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
+ "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
+ "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
+ "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
+ "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
+ "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
+ "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
+ "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
+ "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
+ "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==4.6.2"
+ },
+ "markupsafe": {
+ "hashes": [
+ "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
+ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
+ "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
+ "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
+ "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
+ "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
+ "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
+ "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
+ "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
+ "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
+ "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
+ "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
+ "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
+ "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
+ "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
+ "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
+ "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
+ "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
+ "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
+ "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
+ "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
+ "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
+ "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
+ "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
+ "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
+ "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
+ "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
+ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
+ "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
+ "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
+ "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
+ "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
+ "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
+ ],
+ "version": "==1.1.1"
+ },
+ "minio": {
+ "hashes": [
+ "sha256:7cb075b56bac894551304cb824f958069a84e0dd2d0a685f9bed3c05e15727bf",
+ "sha256:acae9bfae0aec1b92025bd63e18135ebb4994c84600716c5323e14cb0c9a0b03",
+ "sha256:eec4ab073ff979c34e928e532d8acc1d40d61ba4404709cf27ab3ecdcfa2a561"
+ ],
+ "index": "ia",
+ "version": "==6.0.2"
+ },
+ "pathspec": {
+ "hashes": [
+ "sha256:86379d6b86d75816baba717e64b1a3a3469deb93bb76d613c9ce79edc5cb68fd",
+ "sha256:aa0cb481c4041bf52ffa7b0d8fa6cd3e88a2ca4879c533c9153882ee2556790d"
+ ],
+ "version": "==0.8.1"
+ },
+ "pillow": {
+ "hashes": [
+ "sha256:02d2ae1d7c311e6fa038abda0843683ae652c9292d723270c85deeb04a1001a8",
+ "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
+ "sha256:1937c9e17f685fe6c360dd96ddb8f93f159ac721939ccbfc91a62d8124a29945",
+ "sha256:1ce73edaeb49af9ebeacfb8c58428ae39592839d3a7a16ef3926773f1c8ff8ee",
+ "sha256:2052f0372123c98497ee3294f4e20347d87b9f70d9c65ce2fc520b9339aa8465",
+ "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
+ "sha256:26b4a2bcdf0e674505fcd2f1a882f29a99339ac3b5a8e7997b90ede2995434e2",
+ "sha256:282e069f92e43047b34bbd995a8800669af11d038db571758708ebcd96462964",
+ "sha256:2ee30463cbc8e60cba92722a3a64881a0d3df534a047e299a9bcf62ea34fd061",
+ "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
+ "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
+ "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
+ "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
+ "sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c",
+ "sha256:482feca2305feef9d5c38bae66734c64d7d3e649e5b8e01115894ad6d399bad1",
+ "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
+ "sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8",
+ "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
+ "sha256:5bcea0df97fe0b911a6629aab0997b98e8811561c27167266758a7ede173123e",
+ "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
+ "sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f",
+ "sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51",
+ "sha256:699ad2e1a865433f89c7be40fed71d2497da525dc9938218ac3f222a464ea32a",
+ "sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3",
+ "sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2",
+ "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
+ "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
+ "sha256:82351254350d9212bccf71d387efae8ad8f6f4b904d095546a77852a6b16e05b",
+ "sha256:95236f64904157256254b6cc8e29feecd9ee6985732dcb36c9f58d7dabe081d0",
+ "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
+ "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
+ "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
+ "sha256:9e274583a0eab0b6d227139146e28f74488cfbc0d262c4ba2e5c0998b9c498d2",
+ "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
+ "sha256:a10befeb7b9975d7c3d2ca3eaf0cb505db98fe50874130e182c2a6f7a606591f",
+ "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
+ "sha256:b55f62882d8db466fcf2228422bf3147617744888bf0cf6dffb3254a52eb316b",
+ "sha256:bf83901c158ad92e77e990f51531434e5a96c6aef805a84b6e3bfe825f4d4d0c",
+ "sha256:c32f99a0c7c5313b2df78399ef908563b319de23bec0cc89f1d04c37be19eb2d",
+ "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
+ "sha256:cf4b3c634b317ee247c3add4375b0a6bdc45eb0c12a5d7fbf9bfd47ec10b020f",
+ "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
+ "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
+ "sha256:e04df3808d6202dd552c837c824796899c09ff0ff9c335607904e31f9d387110",
+ "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
+ "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
+ "sha256:eae3711a7916eb5ec800dfb6963da09db0ada63c0481639dd0ddc0b505883a02",
+ "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
+ "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
+ ],
+ "index": "ia",
+ "version": "==3.1.1"
+ },
+ "psycopg2": {
+ "hashes": [
+ "sha256:00195b5f6832dbf2876b8bf77f12bdce648224c89c880719c745b90515233301",
+ "sha256:068115e13c70dc5982dfc00c5d70437fe37c014c808acce119b5448361c03725",
+ "sha256:17d50b4966818e09e7221f2d64667a7a2fbb43cff6210d6fb6236a16fe8fc622",
+ "sha256:26e7fd115a6db75267b325de0fba089b911a4a12ebd3d0b5e7acb7028bc46821",
+ "sha256:2c93d4d16933fea5bbacbe1aaf8fa8c1348740b2e50b3735d1b0bf8154cbf0f3",
+ "sha256:56007a226b8e95aa980ada7abdea6b40b75ce62a433bd27cec7a8178d57f4051",
+ "sha256:56fee7f818d032f802b8eed81ef0c1232b8b42390df189cab9cfa87573fe52c5",
+ "sha256:6a3d9efb6f36f1fe6aa8dbb5af55e067db802502c55a9defa47c5a1dad41df84",
+ "sha256:6c237e85e534045ea8e9a49ba57fe1c362b564aaac4940083fef7e74c6bf64cc",
+ "sha256:7a02112996b0dd47a5a2b13c67b301284ebcc68ce7f4881d1f97f8598fe6f1f5",
+ "sha256:7e82d44fc5327d0e6b0f7428bc572a645e6cfa8647860ce1da0d262e548ad921",
+ "sha256:a49833abfdede8985ba3f3ec641f771cca215479f41523e99dace96d5b8cce2a",
+ "sha256:ad2fe8a37be669082e61fb001c185ffb58867fdbb3e7a6b0b0d2ffe232353a3e",
+ "sha256:b8cae8b2f022efa1f011cc753adb9cbadfa5a184431d09b273fb49b4167561ad",
+ "sha256:d160744652e81c80627a909a0e808f3c6653a40af435744de037e3172cf277f5",
+ "sha256:d5062ae50b222da28253059880a871dc87e099c25cb68acf613d9d227413d6f7",
+ "sha256:f155cf65726e4afc2316028fcc5791a1bf384cf2c96562b8b97f18c1fb64f272",
+ "sha256:f22ea9b67aea4f4a1718300908a2fb62b3e4276cf00bd829a97ab5894af42ea3",
+ "sha256:f974c96fca34ae9e4f49839ba6b78addf0346777b46c4da27a7bf54f48d3057d",
+ "sha256:fb23f6c71107c37fd667cb4ea363ddeb936b348bbd6449278eb92c189699f543"
+ ],
+ "index": "ia",
+ "version": "==2.8.6"
+ },
+ "publicsuffix": {
+ "hashes": [
+ "sha256:22ce1d65ab6af5e9b2122e2443facdb93fb5c4abf24138099cb10fe7989f43b6"
+ ],
+ "version": "==1.1.1"
+ },
+ "pydantic": {
+ "hashes": [
+ "sha256:025bf13ce27990acc059d0c5be46f416fc9b293f45363b3d19855165fee1874f",
+ "sha256:185e18134bec5ef43351149fe34fda4758e53d05bb8ea4d5928f0720997b79ef",
+ "sha256:213125b7e9e64713d16d988d10997dabc6a1f73f3991e1ff8e35ebb1409c7dc9",
+ "sha256:24ca47365be2a5a3cc3f4a26dcc755bcdc9f0036f55dcedbd55663662ba145ec",
+ "sha256:38be427ea01a78206bcaf9a56f835784afcba9e5b88fbdce33bbbfbcd7841229",
+ "sha256:475f2fa134cf272d6631072554f845d0630907fce053926ff634cc6bc45bf1af",
+ "sha256:514b473d264671a5c672dfb28bdfe1bf1afd390f6b206aa2ec9fed7fc592c48e",
+ "sha256:59e45f3b694b05a69032a0d603c32d453a23f0de80844fb14d55ab0c6c78ff2f",
+ "sha256:5b24e8a572e4b4c18f614004dda8c9f2c07328cb5b6e314d6e1bbd536cb1a6c1",
+ "sha256:6e3874aa7e8babd37b40c4504e3a94cc2023696ced5a0500949f3347664ff8e2",
+ "sha256:8d72e814c7821125b16f1553124d12faba88e85405b0864328899aceaad7282b",
+ "sha256:a4143c8d0c456a093387b96e0f5ee941a950992904d88bc816b4f0e72c9a0009",
+ "sha256:b2b054d095b6431cdda2f852a6d2f0fdec77686b305c57961b4c5dd6d863bf3c",
+ "sha256:c59ea046aea25be14dc22d69c97bee629e6d48d2b2ecb724d7fe8806bf5f61cd",
+ "sha256:d1fe3f0df8ac0f3a9792666c69a7cd70530f329036426d06b4f899c025aca74e",
+ "sha256:d8df4b9090b595511906fa48deda47af04e7d092318bfb291f4d45dfb6bb2127",
+ "sha256:dba5c1f0a3aeea5083e75db9660935da90216f8a81b6d68e67f54e135ed5eb23",
+ "sha256:e682f6442ebe4e50cb5e1cfde7dda6766fb586631c3e5569f6aa1951fd1a76ef",
+ "sha256:ecb54491f98544c12c66ff3d15e701612fc388161fd455242447083350904730",
+ "sha256:f5b06f5099e163295b8ff5b1b71132ecf5866cc6e7f586d78d7d3fd6e8084608",
+ "sha256:f6864844b039805add62ebe8a8c676286340ba0c6d043ae5dea24114b82a319e",
+ "sha256:ffd180ebd5dd2a9ac0da4e8b995c9c99e7c74c31f985ba090ee01d681b1c4b95"
+ ],
+ "index": "ia",
+ "version": "==1.7.3"
+ },
+ "pylru": {
+ "hashes": [
+ "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ ],
+ "version": "==1.2.0"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
+ "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
+ ],
+ "version": "==2.8.1"
+ },
+ "python-magic": {
+ "hashes": [
+ "sha256:356efa93c8899047d1eb7d3eb91e871ba2f5b1376edbaf4cc305e3c872207355",
+ "sha256:b757db2a5289ea3f1ced9e60f072965243ea43a2221430048fd8cacab17be0ce"
+ ],
+ "index": "ia",
+ "version": "==0.4.18"
+ },
+ "python-poppler": {
+ "hashes": [
+ "sha256:6843398adc9c290035646c4cf3c7bfcea9c8e04390bb9cd8fdc9bd063fb77880"
+ ],
+ "index": "ia",
+ "version": "==0.2.2"
+ },
+ "python-snappy": {
+ "hashes": [
+ "sha256:131edc7701d3def8b72ef6cd61ab491d9efd7b92976dbababb424d74b9c7b180",
+ "sha256:168a98d3f597b633cfeeae7fe1c78a8dfd81f018b866cf7ce9e4c56086af891a",
+ "sha256:1ddc688d2164f3b99f1c7caf1cf137d70af201fe97fd727c5dbe7ec92ac0f1e6",
+ "sha256:2efb42b0fcfa77c361a13951393b33cca60c02144b857e919d26aa0778c44994",
+ "sha256:71472441b63c0d5afbb9b81a19c0dafbaf073d8ab1bbfc70450ec81d3c4b86dc",
+ "sha256:7ef899f2704784032a1f020e983b29ef5c519203ffd0d5a17ff5e1f751b8dba6",
+ "sha256:87f5994aef0a1f1fde01904421106c9006015d97e7b13aa72998f5a942093603"
+ ],
+ "index": "ia",
+ "version": "==0.6.0"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4",
+ "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"
+ ],
+ "version": "==2020.5"
+ },
+ "pyyaml": {
+ "hashes": [
+ "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
+ "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
+ "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
+ "sha256:6034f55dab5fea9e53f436aa68fa3ace2634918e8b5994d82f3621c04ff5ed2e",
+ "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
+ "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
+ "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
+ "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
+ "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
+ "sha256:ad9c67312c84def58f3c04504727ca879cb0013b2517c85a9a253f0cb6380c0a",
+ "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
+ "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
+ "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
+ ],
+ "version": "==5.3.1"
+ },
+ "raven": {
+ "extras": [
+ "flask"
+ ],
+ "hashes": [
+ "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
+ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
+ ],
+ "index": "ia",
+ "version": "==6.10.0"
+ },
+ "readability-lxml": {
+ "hashes": [
+ "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305",
+ "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"
+ ],
+ "version": "==0.8.1"
+ },
+ "redis": {
+ "hashes": [
+ "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
+ "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"
+ ],
+ "version": "==3.5.3"
+ },
+ "regex": {
+ "hashes": [
+ "sha256:02951b7dacb123d8ea6da44fe45ddd084aa6777d4b2454fa0da61d569c6fa538",
+ "sha256:0d08e71e70c0237883d0bef12cad5145b84c3705e9c6a588b2a9c7080e5af2a4",
+ "sha256:1862a9d9194fae76a7aaf0150d5f2a8ec1da89e8b55890b1786b8f88a0f619dc",
+ "sha256:1ab79fcb02b930de09c76d024d279686ec5d532eb814fd0ed1e0051eb8bd2daa",
+ "sha256:1fa7ee9c2a0e30405e21031d07d7ba8617bc590d391adfc2b7f1e8b99f46f444",
+ "sha256:262c6825b309e6485ec2493ffc7e62a13cf13fb2a8b6d212f72bd53ad34118f1",
+ "sha256:2a11a3e90bd9901d70a5b31d7dd85114755a581a5da3fc996abfefa48aee78af",
+ "sha256:2c99e97d388cd0a8d30f7c514d67887d8021541b875baf09791a3baad48bb4f8",
+ "sha256:3128e30d83f2e70b0bed9b2a34e92707d0877e460b402faca908c6667092ada9",
+ "sha256:38c8fd190db64f513fe4e1baa59fed086ae71fa45083b6936b52d34df8f86a88",
+ "sha256:3bddc701bdd1efa0d5264d2649588cbfda549b2899dc8d50417e47a82e1387ba",
+ "sha256:4902e6aa086cbb224241adbc2f06235927d5cdacffb2425c73e6570e8d862364",
+ "sha256:49cae022fa13f09be91b2c880e58e14b6da5d10639ed45ca69b85faf039f7a4e",
+ "sha256:56e01daca75eae420bce184edd8bb341c8eebb19dd3bce7266332258f9fb9dd7",
+ "sha256:5862975b45d451b6db51c2e654990c1820523a5b07100fc6903e9c86575202a0",
+ "sha256:6a8ce43923c518c24a2579fda49f093f1397dad5d18346211e46f134fc624e31",
+ "sha256:6c54ce4b5d61a7129bad5c5dc279e222afd00e721bf92f9ef09e4fae28755683",
+ "sha256:6e4b08c6f8daca7d8f07c8d24e4331ae7953333dbd09c648ed6ebd24db5a10ee",
+ "sha256:717881211f46de3ab130b58ec0908267961fadc06e44f974466d1887f865bd5b",
+ "sha256:749078d1eb89484db5f34b4012092ad14b327944ee7f1c4f74d6279a6e4d1884",
+ "sha256:7913bd25f4ab274ba37bc97ad0e21c31004224ccb02765ad984eef43e04acc6c",
+ "sha256:7a25fcbeae08f96a754b45bdc050e1fb94b95cab046bf56b016c25e9ab127b3e",
+ "sha256:83d6b356e116ca119db8e7c6fc2983289d87b27b3fac238cfe5dca529d884562",
+ "sha256:8b882a78c320478b12ff024e81dc7d43c1462aa4a3341c754ee65d857a521f85",
+ "sha256:8f6a2229e8ad946e36815f2a03386bb8353d4bde368fdf8ca5f0cb97264d3b5c",
+ "sha256:9801c4c1d9ae6a70aeb2128e5b4b68c45d4f0af0d1535500884d644fa9b768c6",
+ "sha256:a15f64ae3a027b64496a71ab1f722355e570c3fac5ba2801cafce846bf5af01d",
+ "sha256:a3d748383762e56337c39ab35c6ed4deb88df5326f97a38946ddd19028ecce6b",
+ "sha256:a63f1a07932c9686d2d416fb295ec2c01ab246e89b4d58e5fa468089cab44b70",
+ "sha256:b2b1a5ddae3677d89b686e5c625fc5547c6e492bd755b520de5332773a8af06b",
+ "sha256:b2f4007bff007c96a173e24dcda236e5e83bde4358a557f9ccf5e014439eae4b",
+ "sha256:baf378ba6151f6e272824b86a774326f692bc2ef4cc5ce8d5bc76e38c813a55f",
+ "sha256:bafb01b4688833e099d79e7efd23f99172f501a15c44f21ea2118681473fdba0",
+ "sha256:bba349276b126947b014e50ab3316c027cac1495992f10e5682dc677b3dfa0c5",
+ "sha256:c084582d4215593f2f1d28b65d2a2f3aceff8342aa85afd7be23a9cad74a0de5",
+ "sha256:d1ebb090a426db66dd80df8ca85adc4abfcbad8a7c2e9a5ec7513ede522e0a8f",
+ "sha256:d2d8ce12b7c12c87e41123997ebaf1a5767a5be3ec545f64675388970f415e2e",
+ "sha256:e32f5f3d1b1c663af7f9c4c1e72e6ffe9a78c03a31e149259f531e0fed826512",
+ "sha256:e3faaf10a0d1e8e23a9b51d1900b72e1635c2d5b0e1bea1c18022486a8e2e52d",
+ "sha256:f7d29a6fc4760300f86ae329e3b6ca28ea9c20823df123a2ea8693e967b29917",
+ "sha256:f8f295db00ef5f8bae530fc39af0b40486ca6068733fb860b42115052206466f"
+ ],
+ "version": "==2020.11.13"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
+ "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ ],
+ "index": "ia",
+ "version": "==2.25.1"
+ },
+ "requests-file": {
+ "hashes": [
+ "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e",
+ "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"
+ ],
+ "version": "==1.5.1"
+ },
+ "robotexclusionrulesparser": {
+ "hashes": [
+ "sha256:d23aa14ae8145c13c95612d696736bad52a4bd0819ce8c9437ee745098fb8388"
+ ],
+ "version": "==1.7.1"
+ },
+ "s3transfer": {
+ "hashes": [
+ "sha256:1e28620e5b444652ed752cf87c7e0cb15b0e578972568c6609f0f18212f259ed",
+ "sha256:7fdddb4f22275cf1d32129e21f056337fd2a80b6ccef1664528145b72c49e6d2"
+ ],
+ "version": "==0.3.4"
+ },
+ "schedule": {
+ "hashes": [
+ "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
+ "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ ],
+ "version": "==0.6.0"
+ },
+ "schema": {
+ "hashes": [
+ "sha256:3a03c2e2b22e6a331ae73750ab1da46916da6ca861b16e6f073ac1d1eba43b71",
+ "sha256:b536f2375b49fdf56f36279addae98bd86a8afbd58b3c32ce363c464bed5fc1c"
+ ],
+ "version": "==0.7.2"
+ },
+ "selectolax": {
+ "hashes": [
+ "sha256:01b26820667dcd8dc0ec94ed874ffc8e45043f0da70466544a9328a79282ff41",
+ "sha256:03e4c0b6d8feb16472482c89a1bf0752335d015172263388c94b8089224ed9e6",
+ "sha256:061efe18e01a624e33317d1f98f20d67f025e228f5cf87f198caceadff9e77f5",
+ "sha256:108f0ed757c5e74cd3d15f3ddb615c891711ae9647fb002aca6dbad5c7f0084c",
+ "sha256:13e6a6ec4b8fc43ef3f6586e17ba85832bbcdf8074d9a31a159d87dd81bf2627",
+ "sha256:231ce804a5e186afa4e7f1639f3a2fdefc5151c1094746fa09821c7c9f5dbeb6",
+ "sha256:274d70e46a94a7b673585957574e571b1838afb5862b9edc7477f704a2e8be3f",
+ "sha256:290b9bc9df879c8538899b5d22e8fa272e07c9edc438396d9b9ad631a7689837",
+ "sha256:2fe935472e9c2c14caf38b65a5ea836f0c3d56081945a8588e14f4136e34ba6b",
+ "sha256:37cb0fd1d933ad7321caa68773fda490d686286eaf4d77922686ad14506c4a2c",
+ "sha256:38661265f318459cd93b1a87b20d8b7b5adeaa353cc96e2d5087a05eef9ce8a3",
+ "sha256:3b21ba8862be4445482e6954c61562851cebd9c9e5db73b0865ea4729e7c85b0",
+ "sha256:4208bfab7c5e14d54104b7959ba1d66f67a51044cb1fccbab62d12c6bd905f02",
+ "sha256:4233599d6507e11a6fab67d9e933d8f445859868b4162eb71c849a832935b575",
+ "sha256:4714c5e6b18ad0ca9f2919b39f333590025e46cb0bb248ffe973333bbf18a491",
+ "sha256:4b9f60a689c0453b6e2a6b92dd2407c82167f3d7624b27184842b2b58d5bc353",
+ "sha256:4ebb88d954dabffa3bafad6cdd758612a7d3b84ceee692c5818bbf0fa93c5f6b",
+ "sha256:519335c313c49151e0a282bef88043eab8756732f24eeb42d2a17e68b3ab174e",
+ "sha256:5e2fb6a27bc7760d57f8cc53adcf5b300a021a3f4102df0e5dd8abb436041c28",
+ "sha256:60ba2ce5060bac7d56dedefe1403602aac1b999a60596294ce3a9520e2c95d71",
+ "sha256:6f7f7a1a030c5612529c0e9df46d690b54d22416d500095ddf3985527f8fb78f",
+ "sha256:804f8e954428a1a325a62a88af39e1fef87c22f0689ee3c3a1d8968ee9648f6e",
+ "sha256:88cc811bb3f9c4eac303dde5ba3ecde0972dba8cebf2fb8001467e752c888838",
+ "sha256:8b85a1356e180d235d9ab92bc3dd90d07e78cab1ef324ae9d12207607c9f26f6",
+ "sha256:8d8e3c7c43805628f2112cda81dba0b8f6620912c24ab2d6635f351985097971",
+ "sha256:90da202496bb99a0924cd26c471f455f64308ed13a24500852635aef5014a43f",
+ "sha256:98e8b60fca5ca6e2f0a2a1882f0c1b771612e5016bd6605545e7c20a8baac244",
+ "sha256:a18f75af342476356e5a437fc5215a3b79b58f52b56d9ea6e1a985cc21895952",
+ "sha256:a36581e0a4f74c5a67d22048fbf34221f9d480bde05acc57702b1cffdcb9ecf5",
+ "sha256:a6724cb313cd7805c7cf4252fdf162e7253cf3a933b7c25ac954feed3edc23ce",
+ "sha256:b8632b165d5da9ecbfb671dbfa879a874cd63d2ea66a8d21b065da1236949947",
+ "sha256:bba6127957c3209e141e42077d952cb1df4a5dc23c522ca9038c8013509588d8",
+ "sha256:c47c7602e8cf8bdce03716b0240d2067eec92f3185cffe34813c60706559ae6a",
+ "sha256:c49ac91cb291eae5c396aa87725ad066ba2fd9690289f3ffcde0022e4276b56e",
+ "sha256:d4144619f88bb94ee2c29cccc23b00a020d6d140d84eda8d7fc4da05dc15f352",
+ "sha256:df57fdbbf772b72993e44cdb326b4937d225e0dd2083cce56100593fe791326a",
+ "sha256:e577ea359151e4df515eabc0c6ea1ddda0577971597c5e9908498a80477befc6",
+ "sha256:e6857ac61acbf747ea56f6c8a72968e7a6ba88053a9a2b5b44091bfb97fb1c87",
+ "sha256:ec2e3f6e49ee252c2fd0c0f297513150ec04e59c7aa0236baebeaaf21b83ffef",
+ "sha256:ecdbad6c95b93256df4c3cb14612215bcd754093415615c55a191bb17fd0ebdc",
+ "sha256:f22653fd48a7f835891bab16095c6f983994d68d16925447e537eb6e3ab79fc4",
+ "sha256:f6c637636cc3bd0025dc9bd07fde28d482c93a6c21cf2e88b827a06766b2b314",
+ "sha256:f83648e412aa610bdff1259dc412383fb290427c05f54e4fad1419b16aca19fe",
+ "sha256:f8f8488fa5859b0da7e4a1bd265b5c0bba45dbf8286e6cee17bf95bcb3d5e797"
+ ],
+ "index": "ia",
+ "version": "==0.2.10"
+ },
+ "sentry-sdk": {
+ "extras": [],
+ "hashes": [
+ "sha256:0a711ec952441c2ec89b8f5d226c33bc697914f46e876b44a4edd3e7864cf4d0",
+ "sha256:737a094e49a529dd0fdcaafa9e97cf7c3d5eb964bd229821d640bc77f3502b3f"
+ ],
+ "index": "ia",
+ "version": "==0.19.5"
+ },
+ "six": {
+ "hashes": [
+ "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
+ "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ ],
+ "version": "==1.15.0"
+ },
+ "soupsieve": {
+ "hashes": [
+ "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851",
+ "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"
+ ],
+ "markers": "python_version >= '3.0'",
+ "version": "==2.1"
+ },
+ "surt": {
+ "hashes": [
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
+ "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ ],
+ "version": "==0.3.1"
+ },
+ "tld": {
+ "hashes": [
+ "sha256:1a69b2cd4053da5377a0b27e048e97871120abf9cd7a62ff270915d0c11369d6",
+ "sha256:1b63094d893657eadfd61e49580b4225ce958ca3b8013dbb9485372cde5a3434",
+ "sha256:3266e6783825a795244a0ed225126735e8121859113b0a7fc830cc49f7bbdaff",
+ "sha256:478d9b23157c7e3e2d07b0534da3b1e61a619291b6e3f52f5a3510e43acec7e9",
+ "sha256:5bd36b24aeb14e766ef1e5c01b96fe89043db44a579848f716ec03c40af50a6b",
+ "sha256:cf1b7af4c1d9c689ca81ea7cf3cae77d1bfd8aaa4c648b58f76a0b3d32e3f6e0",
+ "sha256:d5938730cdb9ce4b0feac4dc887d971f964dba873a74ad818f0f25c1571c6045"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.12.5"
+ },
+ "tldextract": {
+ "hashes": [
+ "sha256:cfae9bc8bda37c3e8c7c8639711ad20e95dc85b207a256b60b0b23d7ff5540ea",
+ "sha256:e57f22b6d00a28c21673d2048112f1bdcb6a14d4711568305f6bb96cf5bb53a1"
+ ],
+ "version": "==3.1.0"
+ },
+ "toml": {
+ "hashes": [
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ ],
+ "version": "==0.10.2"
+ },
+ "tqdm": {
+ "hashes": [
+ "sha256:4621f6823bab46a9cc33d48105753ccbea671b68bab2c50a9f0be23d4065cb5a",
+ "sha256:fe3d08dd00a526850568d542ff9de9bbc2a09a791da3c334f3213d8d0bbbca65"
+ ],
+ "version": "==4.56.0"
+ },
+ "trafilatura": {
+ "hashes": [
+ "sha256:c6ab6fe85449796da5c35e174a83a23a06319bccd60d97db19cca09b0a8b6674",
+ "sha256:c8ff3539c54f683e51994a3ca2e92a5858ff13feeec156dad2e1ea31602a43e0"
+ ],
+ "index": "ia",
+ "version": "==0.7.0"
+ },
+ "twitter": {
+ "hashes": [
+ "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
+ "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
+ ],
+ "version": "==1.18.0"
+ },
+ "typed-ast": {
+ "hashes": [
+ "sha256:07d49388d5bf7e863f7fa2f124b1b1d89d8aa0e2f7812faff0a5658c01c59aa1",
+ "sha256:14bf1522cdee369e8f5581238edac09150c765ec1cb33615855889cf33dcb92d",
+ "sha256:240296b27397e4e37874abb1df2a608a92df85cf3e2a04d0d4d61055c8305ba6",
+ "sha256:36d829b31ab67d6fcb30e185ec996e1f72b892255a745d3a82138c97d21ed1cd",
+ "sha256:37f48d46d733d57cc70fd5f30572d11ab8ed92da6e6b28e024e4a3edfb456e37",
+ "sha256:4c790331247081ea7c632a76d5b2a265e6d325ecd3179d06e9cf8d46d90dd151",
+ "sha256:5dcfc2e264bd8a1db8b11a892bd1647154ce03eeba94b461effe68790d8b8e07",
+ "sha256:7147e2a76c75f0f64c4319886e7639e490fee87c9d25cb1d4faef1d8cf83a440",
+ "sha256:7703620125e4fb79b64aa52427ec192822e9f45d37d4b6625ab37ef403e1df70",
+ "sha256:8368f83e93c7156ccd40e49a783a6a6850ca25b556c0fa0240ed0f659d2fe496",
+ "sha256:84aa6223d71012c68d577c83f4e7db50d11d6b1399a9c779046d75e24bed74ea",
+ "sha256:85f95aa97a35bdb2f2f7d10ec5bbdac0aeb9dafdaf88e17492da0504de2e6400",
+ "sha256:8db0e856712f79c45956da0c9a40ca4246abc3485ae0d7ecc86a20f5e4c09abc",
+ "sha256:9044ef2df88d7f33692ae3f18d3be63dec69c4fb1b5a4a9ac950f9b4ba571606",
+ "sha256:963c80b583b0661918718b095e02303d8078950b26cc00b5e5ea9ababe0de1fc",
+ "sha256:987f15737aba2ab5f3928c617ccf1ce412e2e321c77ab16ca5a293e7bbffd581",
+ "sha256:9ec45db0c766f196ae629e509f059ff05fc3148f9ffd28f3cfe75d4afb485412",
+ "sha256:9fc0b3cb5d1720e7141d103cf4819aea239f7d136acf9ee4a69b047b7986175a",
+ "sha256:a2c927c49f2029291fbabd673d51a2180038f8cd5a5b2f290f78c4516be48be2",
+ "sha256:a38878a223bdd37c9709d07cd357bb79f4c760b29210e14ad0fb395294583787",
+ "sha256:b4fcdcfa302538f70929eb7b392f536a237cbe2ed9cba88e3bf5027b39f5f77f",
+ "sha256:c0c74e5579af4b977c8b932f40a5464764b2f86681327410aa028a22d2f54937",
+ "sha256:c1c876fd795b36126f773db9cbb393f19808edd2637e00fd6caba0e25f2c7b64",
+ "sha256:c9aadc4924d4b5799112837b226160428524a9a45f830e0d0f184b19e4090487",
+ "sha256:cc7b98bf58167b7f2db91a4327da24fb93368838eb84a44c472283778fc2446b",
+ "sha256:cf54cfa843f297991b7388c281cb3855d911137223c6b6d2dd82a47ae5125a41",
+ "sha256:d003156bb6a59cda9050e983441b7fa2487f7800d76bdc065566b7d728b4581a",
+ "sha256:d175297e9533d8d37437abc14e8a83cbc68af93cc9c1c59c2c292ec59a0697a3",
+ "sha256:d746a437cdbca200622385305aedd9aef68e8a645e385cc483bdc5e488f07166",
+ "sha256:e683e409e5c45d5c9082dc1daf13f6374300806240719f95dc783d1fc942af10"
+ ],
+ "version": "==1.4.2"
+ },
+ "tzlocal": {
+ "hashes": [
+ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
+ "sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4"
+ ],
+ "version": "==2.1"
+ },
+ "urlcanon": {
+ "hashes": [
+ "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62"
+ ],
+ "index": "ia",
+ "version": "==0.3.1"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
+ "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ ],
+ "markers": "python_version != '3.4'",
+ "version": "==1.26.2"
+ },
+ "warctools": {
+ "hashes": [
+ "sha256:ce0c6e274db8ac8810f7c97b3943e8e8deadbc3f5c982db77cddaae2d2ae6170"
+ ],
+ "version": "==4.10.0"
+ },
+ "wayback": {
+ "extras": [
+ "brotli"
+ ],
+ "hashes": [
+ "sha256:94371e57c4208f5cc6f79a07ec9de2448c71d9dc90600e187658ccb13bf6ebed"
+ ],
+ "index": "ia",
+ "version": "==0.6.5.1"
+ },
+ "wayback-esp": {
+ "hashes": [
+ "sha256:0c000b439a0ffab69c5602c981affccbb5f0bf5b5398759e07f011ff22d39fcc"
+ ],
+ "version": "==0.2.12.1"
+ },
+ "wayback-search-js": {
+ "hashes": [
+ "sha256:bf5aeafcf8e904240c62224bb128f090b6a6eb00c13929e6d5ed684d15c19677"
+ ],
+ "version": "==2.14.0"
+ },
+ "wbex-client": {
+ "hashes": [
+ "sha256:619ead0408195f4eb87198a99e497c649961da45fcf97cb9bc937ef9e06a9e7f"
+ ],
+ "version": "==0.1.6"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784",
+ "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"
+ ],
+ "version": "==0.2.5"
+ },
+ "werkzeug": {
+ "hashes": [
+ "sha256:2de2a5db0baeae7b2d2664949077c2ac63fbd16d98da0ff71837f7d1dea3fd43",
+ "sha256:6c80b1e5ad3665290ea39320b91e1be1e0d5f60652b964a3070216de83d2e47c"
+ ],
+ "version": "==1.0.1"
+ },
+ "zstandard": {
+ "hashes": [
+ "sha256:03c0b012d910c9a78ac046520ebd7b7cd6fe7b93cd4ec3f49b8f0ac39d57d465",
+ "sha256:0fa280d2edd1dd8b61aa5ce46d023d32d31fa05b98070a5b98e7fd4d878bae1d",
+ "sha256:1b8c4c304e694c0664071fb958f0ea7fb9d2ddd008f1f5b4006235910b6eaaed",
+ "sha256:229530dc3d79b114740f4394ffe94c2ecbd3bbadb20696a9cbbf07f65094dcf3",
+ "sha256:326c3cd8a2a87e92dccb0613110922a6fe4bc86e226854943806cf66203fde9f",
+ "sha256:34bc6bd46cd65f90f018b1f684b027e34841993ab91b951b3385d4a6ea2d8772",
+ "sha256:3691de9c2583ab5b43d91dd39b6575dbe0ef3314b676459522413817104b1278",
+ "sha256:3b466c146dbe9bb1f2d2c9beccda631f112d93e5ac226388545acdf0325bd213",
+ "sha256:4c82f430af910dba1b0e0e5e4b67a74c222703b3575fe1ac268c8257626c0729",
+ "sha256:4d7a72f6fe882c5c768602739ad5ae74dbf115bca5ec2ed1012a7986d190658c",
+ "sha256:53e6dac24fabee67ced260fe62d5cf886be92308cbda4d4e217a3a29341f83da",
+ "sha256:5418e2279a47802886241891fcd82b946dc176e1384c00efe87b32f7d633b879",
+ "sha256:5aa171eaa420e11dd32e327f0231fc7a7cd00c3fad5cbd11f27844827b82317d",
+ "sha256:637d513e0ca84375516b9f5af6f4f0020381200c77e8c43d21fca7f66848e9b7",
+ "sha256:6c09eb0c302b42939f9b3db9cec7d92e2292fc54f13c680ca6c429634ebaaf46",
+ "sha256:72af53b92990ba6118d5dabe8a777cdd36d9fb88d9bf665b84517d0efdcc6f8f",
+ "sha256:74a7461dbc8a28d5b9754c56f7748aa4bb25a591b166213dbfa99312f5d45814",
+ "sha256:756f751ca4c0ac8bc7696d2820e14529f00d284358fa2150b4ce57b1a6e16ad3",
+ "sha256:794517afee12005be1038a0b196c9e4e03409fb2f23e218c11393c6ac658eb5f",
+ "sha256:7bc73d3802145ba40677e8fa8185a0c6848b8e582be3d7d7e5132049cd112307",
+ "sha256:7dadb8bd028bd04b1734734227a06a48f64704a6df82be144376db03be95829c",
+ "sha256:83d84fe875c28f98b574fb7d4ec4e5a41e7cdfeb53b177641eb25afdb93b95cd",
+ "sha256:8de12b37d32a9128a72b6050d4c6070a3bc944557f6b9912ecfc421f9ee97824",
+ "sha256:92ea1c604ec49f3e4ffcbbcfeadc96ebc48bc2edc5f0cb5e03e9c1204ed91869",
+ "sha256:945a49c8e7dbfd28e31eeb8b21d2343370ac75d6ec1c79412a3f9bb4c54c5f13",
+ "sha256:95b682d98086c395e1d8c741a414a4fc8bf7c41254aaa85c8613551fd3ede78c",
+ "sha256:993ce06458283a6c55ed93b862f7fa22b9c27b855f04c98c25f207a7687056cd",
+ "sha256:9d9ab430027e3e04a7d4f13f7af693b80cceb728ff25f2e7a16f19851fa0fe3e",
+ "sha256:9de5c54e34c845c70c18561afd106cf754b41e009a4fc4131bfed537abc1468d",
+ "sha256:9e4c5bfcb232491777546265aba9c30c532edcf93f4d16705402732417f81d18",
+ "sha256:a95e0ba90c0a7f7f6e37e1b730b218e31775777b28630e5ebc05c10b4ee1946e",
+ "sha256:a9f8297180e1f291418524345055c9b2dea9b3ca5db3f72b27e3dac67453a361",
+ "sha256:b223633865725c5c3840ba25a7e71abdc7bb8113be6dd88fb88d46ebe7280df9",
+ "sha256:bea22653bf7242320f9cca1c68f8e381134bee8b2c8f36823085b7e398fad11a",
+ "sha256:cb7c6a6f7d62350b9f5539045da54422975630e34dd9069584cc776b9917115f",
+ "sha256:d2c3a460291ee042057a2dfe962b018a3e49cc0ec83efdfa6a714dcbc4d4b351",
+ "sha256:d308f44ca292c0eb117703ee43b995de77c3d3bf99c90b4c6c74e75a10f62eea",
+ "sha256:db4b9d491fd9ac6b52cc4640da0c8e7b957d1cd80901be30e8be9a470b242d99",
+ "sha256:e0940e9bd36c2d84afd5bd0c45a9c6db3aa8956c080bf7df65b3220a23449360",
+ "sha256:e722ae77e072c26b69a3bca5a4d78d5f4927634f527d05a9057d9d58844053aa",
+ "sha256:f12d97f388fc9bb238280641367f49612016d0353a99eff13b58588f60444263",
+ "sha256:f41d39dc81b0b5558891683267e15c36b43eb06571a5fed7da1e0e2663623d9b",
+ "sha256:f81e2d909327927f7afff2f9961234d4747a163091ad8077edf621a2e128bf80",
+ "sha256:f8905bc741dcec0f6a74445fbfadc032772daca4a908dad7d43e8d65ae1fd3f4",
+ "sha256:fc28f71d935f65c70b20f649ec8db572cdbdeff35528f3b922d7ae50e9c5f9a9",
+ "sha256:ff7642a936734781708ece0721c58b238759be4c473f1855d50515a0fa10a5a2"
+ ],
+ "index": "ia",
+ "version": "==0.15.1"
+ }
+ },
+ "develop": {
+ "astroid": {
+ "hashes": [
+ "sha256:2f4078c2a41bf377eea06d71c9d2ba4eb8f6b1af2135bec27bbbb7d8f12bb703",
+ "sha256:bc58d83eb610252fd8de6363e39d4f1d0619c894b0ed24603b881c02e64c7386"
+ ],
+ "version": "==2.4.2"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
+ "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
+ ],
+ "version": "==20.3.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e",
+ "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"
+ ],
+ "version": "==0.2.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
+ "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ ],
+ "version": "==2020.12.5"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
+ "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.0"
+ },
+ "coverage": {
+ "hashes": [
+ "sha256:08b3ba72bd981531fd557f67beee376d6700fba183b167857038997ba30dd297",
+ "sha256:2757fa64e11ec12220968f65d086b7a29b6583d16e9a544c889b22ba98555ef1",
+ "sha256:3102bb2c206700a7d28181dbe04d66b30780cde1d1c02c5f3c165cf3d2489497",
+ "sha256:3498b27d8236057def41de3585f317abae235dd3a11d33e01736ffedb2ef8606",
+ "sha256:378ac77af41350a8c6b8801a66021b52da8a05fd77e578b7380e876c0ce4f528",
+ "sha256:38f16b1317b8dd82df67ed5daa5f5e7c959e46579840d77a67a4ceb9cef0a50b",
+ "sha256:3911c2ef96e5ddc748a3c8b4702c61986628bb719b8378bf1e4a6184bbd48fe4",
+ "sha256:3a3c3f8863255f3c31db3889f8055989527173ef6192a283eb6f4db3c579d830",
+ "sha256:3b14b1da110ea50c8bcbadc3b82c3933974dbeea1832e814aab93ca1163cd4c1",
+ "sha256:535dc1e6e68fad5355f9984d5637c33badbdc987b0c0d303ee95a6c979c9516f",
+ "sha256:6f61319e33222591f885c598e3e24f6a4be3533c1d70c19e0dc59e83a71ce27d",
+ "sha256:723d22d324e7997a651478e9c5a3120a0ecbc9a7e94071f7e1954562a8806cf3",
+ "sha256:76b2775dda7e78680d688daabcb485dc87cf5e3184a0b3e012e1d40e38527cc8",
+ "sha256:782a5c7df9f91979a7a21792e09b34a658058896628217ae6362088b123c8500",
+ "sha256:7e4d159021c2029b958b2363abec4a11db0ce8cd43abb0d9ce44284cb97217e7",
+ "sha256:8dacc4073c359f40fcf73aede8428c35f84639baad7e1b46fce5ab7a8a7be4bb",
+ "sha256:8f33d1156241c43755137288dea619105477961cfa7e47f48dbf96bc2c30720b",
+ "sha256:8ffd4b204d7de77b5dd558cdff986a8274796a1e57813ed005b33fd97e29f059",
+ "sha256:93a280c9eb736a0dcca19296f3c30c720cb41a71b1f9e617f341f0a8e791a69b",
+ "sha256:9a4f66259bdd6964d8cf26142733c81fb562252db74ea367d9beb4f815478e72",
+ "sha256:9a9d4ff06804920388aab69c5ea8a77525cf165356db70131616acd269e19b36",
+ "sha256:a2070c5affdb3a5e751f24208c5c4f3d5f008fa04d28731416e023c93b275277",
+ "sha256:a4857f7e2bc6921dbd487c5c88b84f5633de3e7d416c4dc0bb70256775551a6c",
+ "sha256:a607ae05b6c96057ba86c811d9c43423f35e03874ffb03fbdcd45e0637e8b631",
+ "sha256:a66ca3bdf21c653e47f726ca57f46ba7fc1f260ad99ba783acc3e58e3ebdb9ff",
+ "sha256:ab110c48bc3d97b4d19af41865e14531f300b482da21783fdaacd159251890e8",
+ "sha256:b239711e774c8eb910e9b1ac719f02f5ae4bf35fa0420f438cdc3a7e4e7dd6ec",
+ "sha256:be0416074d7f253865bb67630cf7210cbc14eb05f4099cc0f82430135aaa7a3b",
+ "sha256:c46643970dff9f5c976c6512fd35768c4a3819f01f61169d8cdac3f9290903b7",
+ "sha256:c5ec71fd4a43b6d84ddb88c1df94572479d9a26ef3f150cef3dacefecf888105",
+ "sha256:c6e5174f8ca585755988bc278c8bb5d02d9dc2e971591ef4a1baabdf2d99589b",
+ "sha256:c89b558f8a9a5a6f2cfc923c304d49f0ce629c3bd85cb442ca258ec20366394c",
+ "sha256:cc44e3545d908ecf3e5773266c487ad1877be718d9dc65fc7eb6e7d14960985b",
+ "sha256:cc6f8246e74dd210d7e2b56c76ceaba1cc52b025cd75dbe96eb48791e0250e98",
+ "sha256:cd556c79ad665faeae28020a0ab3bda6cd47d94bec48e36970719b0b86e4dcf4",
+ "sha256:ce6f3a147b4b1a8b09aae48517ae91139b1b010c5f36423fa2b866a8b23df879",
+ "sha256:ceb499d2b3d1d7b7ba23abe8bf26df5f06ba8c71127f188333dddcf356b4b63f",
+ "sha256:cef06fb382557f66d81d804230c11ab292d94b840b3cb7bf4450778377b592f4",
+ "sha256:e448f56cfeae7b1b3b5bcd99bb377cde7c4eb1970a525c770720a352bc4c8044",
+ "sha256:e52d3d95df81c8f6b2a1685aabffadf2d2d9ad97203a40f8d61e51b70f191e4e",
+ "sha256:ee2f1d1c223c3d2c24e3afbb2dd38be3f03b1a8d6a83ee3d9eb8c36a52bee899",
+ "sha256:f2c6888eada180814b8583c3e793f3f343a692fc802546eed45f40a001b1169f",
+ "sha256:f51dbba78d68a44e99d484ca8c8f604f17e957c1ca09c3ebc2c7e3bbd9ba0448",
+ "sha256:f54de00baf200b4539a5a092a759f000b5f45fd226d6d25a76b0dff71177a714",
+ "sha256:fa10fee7e32213f5c7b0d6428ea92e3a3fdd6d725590238a3f92c0de1c78b9d2",
+ "sha256:fabeeb121735d47d8eab8671b6b031ce08514c86b7ad8f7d5490a7b6dcd6267d",
+ "sha256:fac3c432851038b3e6afe086f777732bcf7f6ebbfd90951fa04ee53db6d0bcdd",
+ "sha256:fda29412a66099af6d6de0baa6bd7c52674de177ec2ad2630ca264142d69c6c7",
+ "sha256:ff1330e8bc996570221b450e2d539134baa9465f5cb98aff0e0f73f34172e0ae"
+ ],
+ "version": "==5.3.1"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
+ "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ ],
+ "version": "==4.4.2"
+ },
+ "flake8": {
+ "hashes": [
+ "sha256:749dbbd6bfd0cf1318af27bf97a14e28e5ff548ef8e5b1566ccfb25a11e7c839",
+ "sha256:aadae8761ec651813c24be05c6f7b4680857ef6afaae4651a4eccaef97ce6c3b"
+ ],
+ "index": "ia",
+ "version": "==3.8.4"
+ },
+ "flake8-annotations": {
+ "hashes": [
+ "sha256:3a377140556aecf11fa9f3bb18c10db01f5ea56dc79a730e2ec9b4f1f49e2055",
+ "sha256:e17947a48a5b9f632fe0c72682fc797c385e451048e7dfb20139f448a074cb3e"
+ ],
+ "index": "ia",
+ "version": "==2.5.0"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "iniconfig": {
+ "hashes": [
+ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+ "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
+ ],
+ "version": "==1.1.1"
+ },
+ "ipython": {
+ "hashes": [
+ "sha256:c987e8178ced651532b3b1ff9965925bfd445c279239697052561a9ab806d28f",
+ "sha256:cbb2ef3d5961d44e6a963b9817d4ea4e1fa2eb589c371a470fed14d8d40cbd6a"
+ ],
+ "index": "ia",
+ "version": "==7.19.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "isort": {
+ "hashes": [
+ "sha256:c729845434366216d320e936b8ad6f9d681aab72dc7cbc2d51bedc3582f3ad1e",
+ "sha256:fff4f0c04e1825522ce6949973e83110a6e907750cd92d128b0d14aaaadbffdc"
+ ],
+ "version": "==5.7.0"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93",
+ "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"
+ ],
+ "version": "==0.18.0"
+ },
+ "lazy-object-proxy": {
+ "hashes": [
+ "sha256:0c4b206227a8097f05c4dbdd323c50edf81f15db3b8dc064d08c62d37e1a504d",
+ "sha256:194d092e6f246b906e8f70884e620e459fc54db3259e60cf69a4d66c3fda3449",
+ "sha256:1be7e4c9f96948003609aa6c974ae59830a6baecc5376c25c92d7d697e684c08",
+ "sha256:4677f594e474c91da97f489fea5b7daa17b5517190899cf213697e48d3902f5a",
+ "sha256:48dab84ebd4831077b150572aec802f303117c8cc5c871e182447281ebf3ac50",
+ "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd",
+ "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239",
+ "sha256:8d859b89baf8ef7f8bc6b00aa20316483d67f0b1cbf422f5b4dc56701c8f2ffb",
+ "sha256:9254f4358b9b541e3441b007a0ea0764b9d056afdeafc1a5569eee1cc6c1b9ea",
+ "sha256:9651375199045a358eb6741df3e02a651e0330be090b3bc79f6d0de31a80ec3e",
+ "sha256:97bb5884f6f1cdce0099f86b907aa41c970c3c672ac8b9c8352789e103cf3156",
+ "sha256:9b15f3f4c0f35727d3a0fba4b770b3c4ebbb1fa907dbcc046a1d2799f3edd142",
+ "sha256:a2238e9d1bb71a56cd710611a1614d1194dc10a175c1e08d75e1a7bcc250d442",
+ "sha256:a6ae12d08c0bf9909ce12385803a543bfe99b95fe01e752536a60af2b7797c62",
+ "sha256:ca0a928a3ddbc5725be2dd1cf895ec0a254798915fb3a36af0964a0a4149e3db",
+ "sha256:cb2c7c57005a6804ab66f106ceb8482da55f5314b7fcb06551db1edae4ad1531",
+ "sha256:d74bb8693bf9cf75ac3b47a54d716bbb1a92648d5f781fc799347cfc95952383",
+ "sha256:d945239a5639b3ff35b70a88c5f2f491913eb94871780ebfabb2568bd58afc5a",
+ "sha256:eba7011090323c1dadf18b3b689845fd96a61ba0a1dfbd7f24b921398affc357",
+ "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
+ "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
+ ],
+ "version": "==1.4.3"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+ "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ ],
+ "version": "==0.6.1"
+ },
+ "mypy": {
+ "hashes": [
+ "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324",
+ "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc",
+ "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802",
+ "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122",
+ "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975",
+ "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7",
+ "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666",
+ "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669",
+ "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178",
+ "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01",
+ "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea",
+ "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de",
+ "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1",
+ "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c"
+ ],
+ "index": "ia",
+ "version": "==0.790"
+ },
+ "mypy-extensions": {
+ "hashes": [
+ "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+ "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
+ ],
+ "version": "==0.4.3"
+ },
+ "packaging": {
+ "hashes": [
+ "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858",
+ "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"
+ ],
+ "version": "==20.8"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:15b00182f472319383252c18d5913b69269590616c947747bc50bf4ac768f410",
+ "sha256:8519430ad07087d4c997fda3a7918f7cfa27cb58972a8c89c2a0295a1c940e9e"
+ ],
+ "version": "==0.8.1"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937",
+ "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.8.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
+ "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+ ],
+ "version": "==0.13.1"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:ac329c69bd8564cb491940511957312c7b8959bb5b3cf3582b406068a51d5bb7",
+ "sha256:b8b3d0bde65da350290c46a8f54f336b3cbf5464a4ac11239668d986852e79d5"
+ ],
+ "version": "==3.0.10"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35",
+ "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"
+ ],
+ "version": "==0.7.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
+ "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+ ],
+ "version": "==1.10.0"
+ },
+ "pycodestyle": {
+ "hashes": [
+ "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367",
+ "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e"
+ ],
+ "version": "==2.6.0"
+ },
+ "pyflakes": {
+ "hashes": [
+ "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92",
+ "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"
+ ],
+ "version": "==2.2.0"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:bc9591213a8f0e0ca1a5e68a479b4887fdc3e75d0774e5c71c31920c427de435",
+ "sha256:df49d09b498e83c1a73128295860250b0b7edd4c723a32e9bc0d295c7c2ec337"
+ ],
+ "version": "==2.7.4"
+ },
+ "pylint": {
+ "hashes": [
+ "sha256:bb4a908c9dadbc3aac18860550e870f58e1a02c9f2c204fdf5693d73be061210",
+ "sha256:bfe68f020f8a0fece830a22dd4d5dddb4ecc6137db04face4c3420a46a52239f"
+ ],
+ "index": "ia",
+ "version": "==2.6.0"
+ },
+ "pyparsing": {
+ "hashes": [
+ "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
+ "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
+ ],
+ "version": "==2.4.7"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:1969f797a1a0dbd8ccf0fecc80262312729afea9c17f1d70ebf85c5e76c6f7c8",
+ "sha256:66e419b1899bc27346cb2c993e12c5e5e8daba9073c1fbce33b9807abc95c306"
+ ],
+ "index": "ia",
+ "version": "==6.2.1"
+ },
+ "pytest-cov": {
+ "hashes": [
+ "sha256:45ec2d5182f89a81fc3eb29e3d1ed3113b9e9a873bcddb2a71faaab066110191",
+ "sha256:47bd0ce14056fdd79f93e1713f88fad7bdcc583dcd7783da86ef2f085a0bb88e"
+ ],
+ "index": "ia",
+ "version": "==2.10.1"
+ },
+ "pytest-mock": {
+ "hashes": [
+ "sha256:379b391cfad22422ea2e252bdfc008edd08509029bcde3c25b2c0bd741e0424e",
+ "sha256:a1e2aba6af9560d313c642dae7e00a2a12b022b80301d9d7fc8ec6858e1dd9fc"
+ ],
+ "index": "ia",
+ "version": "==3.5.1"
+ },
+ "pytest-pylint": {
+ "hashes": [
+ "sha256:790c7a8019fab08e59bd3812db1657a01995a975af8b1c6ce95b9aa39d61da27",
+ "sha256:b63aaf8b80ff33c8ceaa7f68323ed04102c7790093ccf6bdb261a4c2dc6fd564"
+ ],
+ "index": "ia",
+ "version": "==0.18.0"
+ },
+ "pytest-pythonpath": {
+ "hashes": [
+ "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ ],
+ "index": "ia",
+ "version": "==0.7.3"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
+ "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ ],
+ "index": "ia",
+ "version": "==2.25.1"
+ },
+ "responses": {
+ "hashes": [
+ "sha256:2e5764325c6b624e42b428688f2111fea166af46623cb0127c05f6afb14d3457",
+ "sha256:ef265bd3200bdef5ec17912fc64a23570ba23597fd54ca75c18650fa1699213d"
+ ],
+ "index": "ia",
+ "version": "==0.12.1"
+ },
+ "six": {
+ "hashes": [
+ "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
+ "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ ],
+ "version": "==1.15.0"
+ },
+ "toml": {
+ "hashes": [
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ ],
+ "version": "==0.10.2"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396",
+ "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"
+ ],
+ "version": "==5.0.5"
+ },
+ "typed-ast": {
+ "hashes": [
+ "sha256:07d49388d5bf7e863f7fa2f124b1b1d89d8aa0e2f7812faff0a5658c01c59aa1",
+ "sha256:14bf1522cdee369e8f5581238edac09150c765ec1cb33615855889cf33dcb92d",
+ "sha256:240296b27397e4e37874abb1df2a608a92df85cf3e2a04d0d4d61055c8305ba6",
+ "sha256:36d829b31ab67d6fcb30e185ec996e1f72b892255a745d3a82138c97d21ed1cd",
+ "sha256:37f48d46d733d57cc70fd5f30572d11ab8ed92da6e6b28e024e4a3edfb456e37",
+ "sha256:4c790331247081ea7c632a76d5b2a265e6d325ecd3179d06e9cf8d46d90dd151",
+ "sha256:5dcfc2e264bd8a1db8b11a892bd1647154ce03eeba94b461effe68790d8b8e07",
+ "sha256:7147e2a76c75f0f64c4319886e7639e490fee87c9d25cb1d4faef1d8cf83a440",
+ "sha256:7703620125e4fb79b64aa52427ec192822e9f45d37d4b6625ab37ef403e1df70",
+ "sha256:8368f83e93c7156ccd40e49a783a6a6850ca25b556c0fa0240ed0f659d2fe496",
+ "sha256:84aa6223d71012c68d577c83f4e7db50d11d6b1399a9c779046d75e24bed74ea",
+ "sha256:85f95aa97a35bdb2f2f7d10ec5bbdac0aeb9dafdaf88e17492da0504de2e6400",
+ "sha256:8db0e856712f79c45956da0c9a40ca4246abc3485ae0d7ecc86a20f5e4c09abc",
+ "sha256:9044ef2df88d7f33692ae3f18d3be63dec69c4fb1b5a4a9ac950f9b4ba571606",
+ "sha256:963c80b583b0661918718b095e02303d8078950b26cc00b5e5ea9ababe0de1fc",
+ "sha256:987f15737aba2ab5f3928c617ccf1ce412e2e321c77ab16ca5a293e7bbffd581",
+ "sha256:9ec45db0c766f196ae629e509f059ff05fc3148f9ffd28f3cfe75d4afb485412",
+ "sha256:9fc0b3cb5d1720e7141d103cf4819aea239f7d136acf9ee4a69b047b7986175a",
+ "sha256:a2c927c49f2029291fbabd673d51a2180038f8cd5a5b2f290f78c4516be48be2",
+ "sha256:a38878a223bdd37c9709d07cd357bb79f4c760b29210e14ad0fb395294583787",
+ "sha256:b4fcdcfa302538f70929eb7b392f536a237cbe2ed9cba88e3bf5027b39f5f77f",
+ "sha256:c0c74e5579af4b977c8b932f40a5464764b2f86681327410aa028a22d2f54937",
+ "sha256:c1c876fd795b36126f773db9cbb393f19808edd2637e00fd6caba0e25f2c7b64",
+ "sha256:c9aadc4924d4b5799112837b226160428524a9a45f830e0d0f184b19e4090487",
+ "sha256:cc7b98bf58167b7f2db91a4327da24fb93368838eb84a44c472283778fc2446b",
+ "sha256:cf54cfa843f297991b7388c281cb3855d911137223c6b6d2dd82a47ae5125a41",
+ "sha256:d003156bb6a59cda9050e983441b7fa2487f7800d76bdc065566b7d728b4581a",
+ "sha256:d175297e9533d8d37437abc14e8a83cbc68af93cc9c1c59c2c292ec59a0697a3",
+ "sha256:d746a437cdbca200622385305aedd9aef68e8a645e385cc483bdc5e488f07166",
+ "sha256:e683e409e5c45d5c9082dc1daf13f6374300806240719f95dc783d1fc942af10"
+ ],
+ "version": "==1.4.2"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
+ "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
+ "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
+ ],
+ "version": "==3.7.4.3"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
+ "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ ],
+ "markers": "python_version != '3.4'",
+ "version": "==1.26.2"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784",
+ "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"
+ ],
+ "version": "==0.2.5"
+ },
+ "wrapt": {
+ "hashes": [
+ "sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7"
+ ],
+ "version": "==1.12.1"
+ }
+ }
+}
diff --git a/python/TODO b/python/TODO
new file mode 100644
index 0000000..58a463f
--- /dev/null
+++ b/python/TODO
@@ -0,0 +1,7 @@
+
+ingest crawler:
+- SPNv2 only
+ - remove most SPNv1/v2 path selection
+- landing page + fulltext hops only (short recursion depth)
+- use wayback client library instead of requests to fetch content
+- https://pypi.org/project/ratelimit/
diff --git a/python/example.env b/python/example.env
new file mode 100644
index 0000000..5064c96
--- /dev/null
+++ b/python/example.env
@@ -0,0 +1,7 @@
+SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin"
+SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"
+IA_ACCESS_KEY="dummy"
+IA_SECRET_KEY="dummy"
+CDX_AUTH_TOKEN="dummy"
+PETABOX_WEBDATA_SECRET="dummy"
+SENTRY_DSN=""
diff --git a/python/grobid2json.py b/python/grobid2json.py
new file mode 100755
index 0000000..a22d47d
--- /dev/null
+++ b/python/grobid2json.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+"""
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
+This script tries to extract everything from a GROBID TEI XML fulltext dump:
+
+- header metadata
+- affiliations
+- references (with context)
+- abstract
+- fulltext
+- tables, figures, equations
+
+A flag can be specified to disable copyright encumbered bits (--no-encumbered):
+
+- abstract
+- fulltext
+- tables, figures, equations
+
+Prints JSON to stdout, errors to stderr
+"""
+
+import io
+import json
+import argparse
+import xml.etree.ElementTree as ET
+from typing import List, Any, Dict, AnyStr, Optional
+
+xml_ns = "http://www.w3.org/XML/1998/namespace"
+ns = "http://www.tei-c.org/ns/1.0"
+
+
+def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
+    """
+    Collects one dict per TEI <author> element found anywhere under `elem`.
+
+    Every dict has a "name" key (all text under <persName>, joined with
+    spaces). "given_name", "surname" and "affiliation" are only added when
+    present. Authors without a <persName> are skipped.
+
+    Returns an empty list when `elem` is None.
+    """
+    # NB: always compare against None. An ET.Element with no child elements is
+    # falsy, so a bare truth test would silently drop a <persName> that holds
+    # only text (and is deprecated in recent Python versions).
+    if elem is None:
+        return []
+    names = []
+    for author in elem.findall(".//{%s}author" % ns):
+        pn = author.find("./{%s}persName" % ns)
+        if pn is None:
+            continue
+        given_name = pn.findtext("./{%s}forename" % ns) or None
+        surname = pn.findtext("./{%s}surname" % ns) or None
+        full_name = " ".join(pn.itertext())
+        obj: Dict[str, Any] = dict(name=full_name)
+        if given_name:
+            obj["given_name"] = given_name
+        if surname:
+            obj["surname"] = surname
+        ae = author.find("./{%s}affiliation" % ns)
+        # only child elements of <affiliation> are read below, so a childless
+        # element carries nothing to report and is skipped (as before)
+        if ae is not None and len(ae) > 0:
+            affiliation: Dict[str, Any] = dict()
+            for on in ae.findall("./{%s}orgName" % ns):
+                on_type = on.get("type")
+                if on_type:
+                    affiliation[on_type] = on.text
+            addr_e = ae.find("./{%s}address" % ns)
+            if addr_e is not None:
+                # keys are the namespace-stripped tag names of the children
+                address = {t.tag.split("}")[-1]: t.text for t in addr_e}
+                if address:
+                    affiliation["address"] = address
+            obj["affiliation"] = affiliation
+        names.append(obj)
+    return names
+
+
+def journal_info(elem: ET.Element) -> Dict[str, Any]:
+    """
+    Looks up journal-level fields under `elem` and returns only those that
+    were found with a non-empty value.
+
+    Possible keys: name, publisher, issn, eissn, volume, issue.
+    """
+    xpaths = {
+        "name": ".//{%s}monogr/{%s}title" % (ns, ns),
+        "publisher": ".//{%s}publicationStmt/{%s}publisher" % (ns, ns),
+        "issn": './/{%s}idno[@type="ISSN"]' % ns,
+        "eissn": './/{%s}idno[@type="eISSN"]' % ns,
+        "volume": './/{%s}biblScope[@unit="volume"]' % ns,
+        "issue": './/{%s}biblScope[@unit="issue"]' % ns,
+    }
+    found = {field: elem.findtext(xpath) for field, xpath in xpaths.items()}
+    # missing (None) and empty-string values are both left out
+    return {field: value for field, value in found.items() if value}
+
+
+def biblio_info(elem: ET.Element) -> Dict[str, Any]:
+    """
+    Extracts a single bibliographic reference from a TEI <biblStruct>.
+
+    Returned keys: id, title, journal (only set when a monograph title is
+    present), authors, publisher, date, volume, issue, url. Fields that are
+    not found are None (authors is an empty list).
+    """
+    ref: Dict[str, Any] = dict()
+    ref["id"] = elem.attrib.get("{%s}id" % xml_ns)  # xml:id
+    # Title stuff is messy in references...
+    ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+    other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+    if other_title:
+        if ref["title"]:
+            # both present: the monograph title is treated as the journal
+            ref["journal"] = other_title
+        else:
+            # only a monograph title: use it as the reference title
+            ref["journal"] = None
+            ref["title"] = other_title
+    ref["authors"] = all_authors(elem)
+    ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+    if ref["publisher"] == "":
+        ref["publisher"] = None
+    date = elem.find('.//{%s}date[@type="published"]' % ns)
+    # None (not False) when there is no published date, like the other fields
+    ref["date"] = date.attrib.get("when") if date is not None else None
+    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    el = elem.find(".//{%s}ptr[@target]" % ns)
+    if el is not None:
+        ref["url"] = el.attrib["target"]
+        # Hand correction: strip a trailing ".Lastaccessed" (suffix only, so
+        # the same text elsewhere in the URL is left untouched)
+        suffix = ".Lastaccessed"
+        if ref["url"].endswith(suffix):
+            ref["url"] = ref["url"][: -len(suffix)]
+    else:
+        ref["url"] = None
+    return ref
+
+
+def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
+    """
+    Parses a GROBID TEI-XML document (str or bytes) into a flat dict.
+
+    Header metadata, authors, journal info and citations are always
+    extracted. When `encumbered` is true, abstract, body, acknowledgement and
+    annex text are included as well. Keys with empty/None values are removed
+    from the result.
+
+    Raises TypeError if `content` is neither str nor bytes, and ValueError if
+    the document has no <teiHeader>.
+    """
+    if isinstance(content, str):
+        tree = ET.parse(io.StringIO(content))
+    elif isinstance(content, bytes):
+        tree = ET.parse(io.BytesIO(content))
+    else:
+        # previously fell through and failed later with an unbound `tree`
+        raise TypeError("content must be str or bytes, got {}".format(type(content).__name__))
+
+    def joined_text(el: Optional[ET.Element]) -> Optional[str]:
+        # Explicit None check: an element without child elements is falsy but
+        # may still carry text of its own.
+        if el is None:
+            return None
+        return " ".join(el.itertext()).strip()
+
+    info: Dict[str, Any] = dict()
+
+    tei = tree.getroot()
+
+    header = tei.find(".//{%s}teiHeader" % ns)
+    if header is None:
+        raise ValueError("XML does not look like TEI format")
+    application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
+    info["grobid_version"] = application_tag.attrib["version"].strip()
+    info["grobid_timestamp"] = application_tag.attrib["when"].strip()
+    info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+    info["authors"] = all_authors(
+        header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
+    )
+    info["journal"] = journal_info(header)
+    date = header.find('.//{%s}date[@type="published"]' % ns)
+    info["date"] = date.attrib.get("when") if date is not None else None
+    info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+    info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+    if info["doi"]:
+        info["doi"] = info["doi"].lower()
+
+    refs = []
+    for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
+        ref = biblio_info(bs)
+        ref["index"] = i
+        refs.append(ref)
+    info["citations"] = refs
+
+    text = tei.find(".//{%s}text" % (ns))
+    if text is not None and text.attrib.get("{%s}lang" % xml_ns):
+        info["language_code"] = text.attrib["{%s}lang" % xml_ns]  # xml:lang
+
+    if encumbered:
+        info["abstract"] = joined_text(
+            tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
+        )
+        info["body"] = joined_text(tei.find(".//{%s}text/{%s}body" % (ns, ns)))
+        info["acknowledgement"] = joined_text(
+            tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
+        )
+        info["annex"] = joined_text(
+            tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
+        )
+
+    # remove empty/null keys
+    keys = list(info.keys())
+    for k in keys:
+        if not info[k]:
+            info.pop(k)
+    return info
+
+
+def main() -> None:  # pragma no cover
+    """
+    Command-line entry point: converts each TEI-XML file given on the command
+    line and prints one JSON object per file to stdout.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="GROBID TEI XML to JSON",
+        usage="%(prog)s [options] <teifile>...",
+    )
+    parser.add_argument(
+        "--no-encumbered",
+        action="store_true",
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
+    )
+    parser.add_argument("teifiles", nargs="+")
+
+    args = parser.parse_args()
+
+    for filename in args.teifiles:
+        # context manager so each file handle is closed before the next one
+        with open(filename, "r") as tei_file:
+            content = tei_file.read()
+        print(
+            json.dumps(
+                teixml2json(content, encumbered=(not args.no_encumbered)),
+                sort_keys=True,
+            )
+        )
+
+
+if __name__ == "__main__": # pragma no cover
+ main()
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
new file mode 100755
index 0000000..2a1d8b5
--- /dev/null
+++ b/python/grobid_tool.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
+"""
+These are generally for running one-off tasks from the command line. Output
+might go to stdout, or might go to Kafka topic.
+
+Example of large parallel run, locally:
+
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+from grobid2json import teixml2json
+from sandcrawler import *
+
+
+def run_extract_json(args):
+    """
+    Pushes each JSON line of args.json_file through a GrobidWorker.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the output sink; otherwise the worker writes to args.sink directly.
+    """
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = GrobidWorker(grobid_client, wayback_client, sink=None)
+        # pass the requested parallelism through, as run_extract_zipfile
+        # already does; without it the wrapper uses its own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    else:
+        worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
+        pusher = JsonLinePusher(worker, args.json_file)
+    pusher.run()
+
+def run_extract_cdx(args):
+    """
+    Pushes CDX lines of args.cdx_file through a GrobidWorker, restricted to
+    HTTP status 200/226 and mimetype application/pdf.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the output sink; otherwise the worker writes to args.sink directly.
+    """
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = GrobidWorker(grobid_client, wayback_client, sink=None)
+        # pass the requested parallelism through, as run_extract_zipfile
+        # already does; without it the wrapper uses its own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = CdxLinePusher(
+            multi_worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs,
+        )
+    else:
+        worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
+        pusher = CdxLinePusher(
+            worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+    pusher.run()
+
+def run_extract_zipfile(args):
+    """
+    Runs GROBID extraction over the PDFs inside args.zip_file, using a
+    MultiprocessWrapper when --jobs is greater than one.
+    """
+    client = GrobidClient(host_url=args.grobid_host)
+    parallel = args.jobs > 1
+    if parallel:
+        print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+        # the wrapper, not the inner worker, owns the sink in this mode
+        inner = GrobidBlobWorker(client, sink=None)
+        wrapped = MultiprocessWrapper(inner, args.sink, jobs=args.jobs)
+        pusher = ZipfilePusher(wrapped, args.zip_file, batch_size=args.jobs)
+    else:
+        single = GrobidBlobWorker(client, sink=args.sink)
+        pusher = ZipfilePusher(single, args.zip_file)
+    pusher.run()
+
+def run_transform(args):
+    """
+    Reads JSON lines from args.json_file and prints one transformed JSON
+    object per non-blank line: bibliographic metadata only (via
+    GrobidClient.metadata) with --metadata-only, otherwise the full
+    teixml2json() conversion of the 'tei_xml' field. Falsy results are not
+    printed.
+    """
+    grobid_client = GrobidClient()
+    for raw in args.json_file:
+        if not raw.strip():
+            continue
+        record = json.loads(raw)
+        out = (
+            grobid_client.metadata(record)
+            if args.metadata_only
+            else teixml2json(record['tei_xml'])
+        )
+        if not out:
+            continue
+        # carry the optional 'source' field over to the output
+        if 'source' in record:
+            out['source'] = record['source']
+        print(json.dumps(out))
+
+
+def main():
+    """
+    Command-line entry point: parses global options plus one sub-command
+    (extract-json, extract-cdx, extract-zipfile, transform), sets up the
+    optional Kafka sink as args.sink, and dispatches to the sub-command.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--kafka-mode',
+        action='store_true',
+        help="send output to Kafka (not stdout)")
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--kafka-env',
+        default="dev",
+        help="Kafka topic namespace to use (eg, prod, qa, dev)")
+    parser.add_argument('-j', '--jobs',
+        default=8, type=int,
+        help="parallelism for batch CPU jobs")
+    parser.add_argument('--grobid-host',
+        default="http://grobid.qa.fatcat.wiki",
+        help="GROBID API host/port")
+    subparsers = parser.add_subparsers()
+
+    sub_extract_json = subparsers.add_parser('extract-json',
+        help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
+    sub_extract_json.set_defaults(func=run_extract_json)
+    sub_extract_json.add_argument('json_file',
+        help="JSON file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_extract_cdx = subparsers.add_parser('extract-cdx',
+        help="for each CDX line, fetches PDF and does GROBID extraction")
+    sub_extract_cdx.set_defaults(func=run_extract_cdx)
+    sub_extract_cdx.add_argument('cdx_file',
+        help="CDX file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
+        help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
+    sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
+    sub_extract_zipfile.add_argument('zip_file',
+        help="zipfile with PDFs to extract",
+        type=str)
+
+    sub_transform = subparsers.add_parser('transform')
+    sub_transform.set_defaults(func=run_transform)
+    sub_transform.add_argument('--metadata-only',
+        action='store_true',
+        help="Only pass through bibliographic metadata, not fulltext")
+    sub_transform.add_argument('json_file',
+        help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
+        type=argparse.FileType('r'))
+
+    args = parser.parse_args()
+    # no sub-command given: nothing to dispatch to
+    if not args.__dict__.get("func"):
+        parser.print_help(file=sys.stderr)
+        sys.exit(-1)
+
+    args.sink = None
+    if args.kafka_mode:
+        produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
+        # status goes to stderr so stdout only ever carries data
+        print("Running in kafka output mode, publishing to {}\n".format(produce_topic),
+            file=sys.stderr)
+        args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=produce_topic)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
new file mode 100755
index 0000000..20c65bb
--- /dev/null
+++ b/python/ia_pdf_match.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+"""
+Input is IA item metadata JSON.
+Output is insertable fatcat "match" JSON
+
+- md5
+- sha1
+- sha256
+- size
+- urls
+- cdx (list; empty here)
+
+- dois (list)
+- pmcid
+- jstor
+- arxiv
+
+When invoking import matched, be sure to:
+
+ --default-link-rel repository (?)
+ --default-mimetype application/pdf
+"""
+
+import sys
+import json
+
+def parse(obj):
+    """
+    Converts one IA item metadata object into a fatcat "match" dict.
+
+    Returns None (after logging a "skip: ..." line to stderr) for test items,
+    arxiv items without a versioned source URL, and items without an original
+    PDF file. Raises NotImplementedError for identifier prefixes other than
+    arxiv-, paper-doi-10_, pubmed-PMC and jstor-; malformed identifiers trip
+    an assert or a ValueError from int().
+    """
+    metadata = obj['metadata']
+    identifier = metadata['identifier']
+
+    if identifier.endswith('-test') or metadata.get('test'):
+        print('skip: test item', file=sys.stderr)
+        return None
+
+    # work out which external identifier this item carries
+    if identifier.startswith('arxiv-'):
+        extid_type = 'arxiv'
+        extid = metadata.get('source')
+        if not extid:
+            print('skip: no source', file=sys.stderr)
+            return None
+        assert extid.startswith('http://arxiv.org/abs/')
+        extid = extid.replace('http://arxiv.org/abs/', '')
+        assert '/' in extid or '.' in extid
+        # only versioned arxiv ids (containing 'v', ending in a digit) pass
+        if not ('v' in extid and extid[-1].isdigit()):
+            print('skip: non-versioned arxiv_id', file=sys.stderr)
+            return None
+    elif identifier.startswith('paper-doi-10_'):
+        extid_type = 'doi'
+        extid = metadata['identifier-doi']
+        assert extid.startswith("10.")
+    elif identifier.startswith('pubmed-PMC'):
+        extid_type = 'pmcid'
+        extid = identifier.replace('pubmed-', '')
+        assert extid.startswith("PMC")
+        int(extid[3:])  # validation only: must be numeric after "PMC"
+    elif identifier.startswith('jstor-'):
+        extid_type = 'jstor'
+        extid = identifier.replace('jstor-', '')
+        int(extid)  # validation only: must be numeric
+    else:
+        raise NotImplementedError()
+
+    # first file uploaded as an original whose format mentions PDF
+    pdf_file = next(
+        (f for f in obj['files'] if f['source'] == "original" and "PDF" in f['format']),
+        None,
+    )
+    if not pdf_file:
+        print('skip: no PDF found: {}'.format(identifier), file=sys.stderr)
+        return None
+
+    assert pdf_file['name'].endswith('.pdf')
+
+    url = "https://archive.org/download/{}/{}".format(identifier, pdf_file['name'])
+    match = dict(
+        md5=pdf_file['md5'],
+        sha1=pdf_file['sha1'],
+        size=int(pdf_file['size']),
+        mimetype='application/pdf',
+        urls=[url],
+        cdx=[],
+        dois=[],
+    )
+
+    # DOIs go into the list field; every other id gets a key of its own
+    if extid_type == 'doi':
+        match['dois'] = [extid]
+    else:
+        match[extid_type] = extid
+
+    return match
+
+def run():
+    """
+    Reads IA item metadata (one JSON object per line) from stdin and prints a
+    "match" JSON line for every item that parse() accepts.
+    """
+    for line in sys.stdin:
+        # lines read from stdin keep their newline, so a blank line is "\n"
+        # (truthy); strip before testing or json.loads() would fail on it
+        if not line.strip():
+            continue
+        obj = json.loads(line)
+        match = parse(obj)
+        if match:
+            print(json.dumps(match, sort_keys=True))
+
+if __name__ == '__main__':
+ run()
diff --git a/python/ingest_file.py b/python/ingest_file.py
new file mode 100755
index 0000000..20b6d67
--- /dev/null
+++ b/python/ingest_file.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import argparse
+
+from http.server import HTTPServer
+from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
+
+
+def run_single_ingest(args):
+    """
+    Builds a single ingest request from the command-line arguments, runs it
+    through an IngestFileWorker, prints the result as JSON and returns it.
+    """
+    request = {
+        'ingest_type': args.ingest_type,
+        'base_url': args.url,
+        'ext_ids': {'doi': args.doi},
+        'fatcat': {'release_ident': args.release_id},
+    }
+    if args.force_recrawl:
+        request['force_recrawl'] = True
+    worker = IngestFileWorker(
+        try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
+    )
+    outcome = worker.process(request)
+    print(json.dumps(outcome, sort_keys=True))
+    return outcome
+
+def run_requests(args):
+    """
+    Runs every ingest request (one JSON object per line of args.json_file)
+    through an IngestFileWorker and prints each result as a JSON line.
+    Blank lines are skipped.
+    """
+    # TODO: switch to using JsonLinePusher
+    ingester = IngestFileWorker(
+        try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
+    )
+    for line in args.json_file:
+        line = line.strip()
+        # tolerate blank lines instead of failing inside json.loads()
+        if not line:
+            continue
+        request = json.loads(line)
+        result = ingester.process(request)
+        print(json.dumps(result, sort_keys=True))
+
+def run_api(args):
+    """
+    Starts a blocking HTTP server that handles ingest requests with
+    IngestFileRequestHandler, on the port given by --port.
+    """
+    # honor --port; it used to be parsed but ignored in favor of a constant
+    port = args.port
+    print("Listening on localhost:{}".format(port))
+    server = HTTPServer(('', port), IngestFileRequestHandler)
+    server.serve_forever()
+
+def main():
+    """
+    Command-line entry point: parses one sub-command (single, requests, api)
+    and dispatches to the matching run_* function.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    sub_single = subparsers.add_parser('single',
+        help="ingests a single file URL")
+    sub_single.set_defaults(func=run_single_ingest)
+    sub_single.add_argument('--release-id',
+        help="(optional) existing release ident to match to")
+    sub_single.add_argument('--doi',
+        help="(optional) existing release DOI to match to")
+    sub_single.add_argument('--force-recrawl',
+        action='store_true',
+        help="ignore GWB history and use SPNv2 to re-crawl")
+    sub_single.add_argument('--no-spn2',
+        action='store_true',
+        help="don't use live web (SPNv2)")
+    sub_single.add_argument('--ingest-type',
+        default="pdf",
+        help="type of ingest (pdf, html, etc)")
+    sub_single.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
+    sub_single.add_argument('url',
+        help="URL of paper to fetch")
+
+    sub_requests = subparsers.add_parser('requests',
+        help="takes a series of ingest requests (JSON, per line) and runs each")
+    sub_requests.add_argument('--no-spn2',
+        action='store_true',
+        help="don't use live web (SPNv2)")
+    sub_requests.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
+    sub_requests.set_defaults(func=run_requests)
+    # nargs='?' makes the positional optional; without it the stdin default
+    # below could never take effect
+    sub_requests.add_argument('json_file',
+        help="JSON file (request per line) to import from (or stdin)",
+        nargs='?',
+        default=sys.stdin, type=argparse.FileType('r'))
+
+    sub_api = subparsers.add_parser('api',
+        help="starts a simple HTTP server that processes ingest requests")
+    sub_api.set_defaults(func=run_api)
+    # default matches the port run_api has historically listened on
+    sub_api.add_argument('--port',
+        help="HTTP port to listen on",
+        default=8083, type=int)
+
+    args = parser.parse_args()
+    # no sub-command given: nothing to dispatch to
+    if not args.__dict__.get("func"):
+        parser.print_help(file=sys.stderr)
+        sys.exit(-1)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
new file mode 100755
index 0000000..10a0f48
--- /dev/null
+++ b/python/pdfextract_tool.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+
+"""
+KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+from grobid2json import teixml2json
+from sandcrawler import *
+
+
+def run_extract_json(args):
+    """
+    Pushes each JSON line of args.json_file through a PdfExtractWorker.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the text sink; no thumbnail sink is attached in that mode (see the KNOWN
+    ISSUE note at the top of this file).
+    """
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=None)
+        # pass the requested parallelism through, as run_extract_zipfile
+        # already does; without it the wrapper uses its own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    else:
+        worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+        pusher = JsonLinePusher(worker, args.json_file)
+    pusher.run()
+
+def run_extract_cdx(args):
+    """
+    Pushes CDX lines of args.cdx_file through a PdfExtractWorker, restricted
+    to HTTP status 200/226 and mimetype application/pdf.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the text sink; no thumbnail sink is attached in that mode (see the KNOWN
+    ISSUE note at the top of this file).
+    """
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=None)
+        # pass the requested parallelism through, as run_extract_zipfile
+        # already does; without it the wrapper uses its own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = CdxLinePusher(
+            multi_worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs,
+        )
+    else:
+        worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+        pusher = CdxLinePusher(
+            worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+    pusher.run()
+
+def run_extract_zipfile(args):
+    """
+    Runs PDF extraction over the PDFs inside args.zip_file, using a
+    MultiprocessWrapper when --jobs is greater than one.
+    """
+    parallel = args.jobs > 1
+    if parallel:
+        print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+        # the wrapper owns the text sink; no thumbnail sink in this mode
+        inner = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
+        wrapped = MultiprocessWrapper(inner, args.sink, jobs=args.jobs)
+        pusher = ZipfilePusher(wrapped, args.zip_file, batch_size=args.jobs)
+    else:
+        single = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+        pusher = ZipfilePusher(single, args.zip_file)
+    pusher.run()
+
+def run_single(args):
+    """
+    Reads the PDF at args.pdf_file and pushes its bytes through a
+    PdfExtractBlobWorker, then finishes the worker and, if one is
+    configured, the thumbnail sink.
+    """
+    blob_worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+    with open(args.pdf_file, 'rb') as handle:
+        blob = handle.read()
+    blob_worker.push_record(blob)
+    blob_worker.finish()
+    thumbnails = args.thumbnail_sink
+    if thumbnails:
+        thumbnails.finish()
+
+
+
+def main():
+    """
+    Command-line entry point: parses global options plus one sub-command
+    (extract-json, extract-cdx, extract-zipfile, single), sets up the
+    optional Kafka sinks as args.sink / args.thumbnail_sink, and dispatches
+    to the sub-command.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--kafka-mode',
+        action='store_true',
+        help="send output to Kafka (not stdout)")
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--kafka-env',
+        default="dev",
+        help="Kafka topic namespace to use (eg, prod, qa, dev)")
+    parser.add_argument('-j', '--jobs',
+        default=8, type=int,
+        help="parallelism for batch CPU jobs")
+    subparsers = parser.add_subparsers()
+
+    # (command, command help, handler, positional name, positional help, type)
+    commands = [
+        ('extract-json',
+            "for each JSON line with CDX info, fetches PDF and does PDF extraction",
+            run_extract_json,
+            'json_file',
+            "JSON file to import from (or '-' for stdin)",
+            argparse.FileType('r')),
+        ('extract-cdx',
+            "for each CDX line, fetches PDF and does PDF extraction",
+            run_extract_cdx,
+            'cdx_file',
+            "CDX file to import from (or '-' for stdin)",
+            argparse.FileType('r')),
+        ('extract-zipfile',
+            "opens zipfile, iterates over PDF files inside and does PDF extract for each",
+            run_extract_zipfile,
+            'zip_file',
+            "zipfile with PDFs to extract",
+            str),
+        ('single',
+            "opens single PDF and extracts it",
+            run_single,
+            'pdf_file',
+            "single PDF to extract",
+            str),
+    ]
+    for command, command_help, handler, positional, positional_help, kind in commands:
+        sub = subparsers.add_parser(command, help=command_help)
+        sub.set_defaults(func=handler)
+        sub.add_argument(positional, help=positional_help, type=kind)
+
+    args = parser.parse_args()
+    # no sub-command given: nothing to dispatch to
+    if not args.__dict__.get("func"):
+        parser.print_help(file=sys.stderr)
+        sys.exit(-1)
+
+    # default: no sinks; replaced below when Kafka output is requested
+    args.text_sink = None
+    args.sink = None
+    args.thumbnail_sink = None
+    if args.kafka_mode:
+        text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
+        thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
+        args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=text_topic)
+        args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=thumbnail_topic)
+        print("Running in kafka output mode, publishing to {} and {}\n".format(
+            text_topic, thumbnail_topic), file=sys.stderr)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
new file mode 100755
index 0000000..5cffa8c
--- /dev/null
+++ b/python/pdftrio_tool.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+"""
+Basically just a copy of grobid_tool.py, but for PDF classification instead of
+text extraction.
+
+Example of large parallel run, locally:
+
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+from sandcrawler import *
+
+
+def run_classify_pdf_json(args):
+    """
+    Pushes each JSON line of args.json_file through a PdfTrioWorker.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the output sink; otherwise the worker writes to args.sink directly.
+    """
+    pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+        # pass the requested parallelism through (as the extract-zipfile
+        # commands of the sibling tools do); without it the wrapper uses its
+        # own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    else:
+        worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+        pusher = JsonLinePusher(worker, args.json_file)
+    pusher.run()
+
+def run_classify_pdf_cdx(args):
+    """
+    Pushes CDX lines of args.cdx_file through a PdfTrioWorker, restricted to
+    HTTP status 200/226 and mimetype application/pdf.
+
+    With --jobs > 1 the worker is wrapped in a MultiprocessWrapper which owns
+    the output sink; otherwise the worker writes to args.sink directly.
+    """
+    pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
+    wayback_client = WaybackClient()
+    if args.jobs > 1:
+        worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+        # pass the requested parallelism through (as the extract-zipfile
+        # commands of the sibling tools do); without it the wrapper uses its
+        # own default
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = CdxLinePusher(
+            multi_worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs,
+        )
+    else:
+        worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+        pusher = CdxLinePusher(
+            worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
+    pusher.run()
+
+def run_classify_pdf_zipfile(args):
+    """
+    Classifies the PDFs inside args.zip_file with a single PdfTrioBlobWorker
+    writing to args.sink.
+    """
+    client = PdfTrioClient(host_url=args.pdftrio_host)
+    blob_worker = PdfTrioBlobWorker(client, sink=args.sink, mode=args.pdftrio_mode)
+    ZipfilePusher(blob_worker, args.zip_file).run()
+
+
+def main():
+    """
+    Command-line entry point: parses global options plus one sub-command
+    (classify-pdf-json, classify-pdf-cdx, classify-pdf-zipfile), sets up the
+    optional Kafka sink as args.sink, and dispatches to the sub-command.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--kafka-mode',
+        action='store_true',
+        help="send output to Kafka (not stdout)")
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--kafka-env',
+        default="dev",
+        help="Kafka topic namespace to use (eg, prod, qa, dev)")
+    parser.add_argument('-j', '--jobs',
+        default=8, type=int,
+        help="parallelism for batch CPU jobs")
+    parser.add_argument('--pdftrio-host',
+        default="http://pdftrio.qa.fatcat.wiki",
+        help="pdftrio API host/port")
+    parser.add_argument('--pdftrio-mode',
+        default="auto",
+        help="which classification mode to use")
+    subparsers = parser.add_subparsers()
+
+    # help strings below previously read "classify_pdfion", a search/replace
+    # leftover
+    sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json',
+        help="for each JSON line with CDX info, fetches PDF and does pdftrio classification")
+    sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json)
+    sub_classify_pdf_json.add_argument('json_file',
+        help="JSON file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_classify_pdf_cdx = subparsers.add_parser('classify-pdf-cdx',
+        help="for each CDX line, fetches PDF and does pdftrio classification")
+    sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx)
+    sub_classify_pdf_cdx.add_argument('cdx_file',
+        help="CDX file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_classify_pdf_zipfile = subparsers.add_parser('classify-pdf-zipfile',
+        help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each")
+    sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile)
+    sub_classify_pdf_zipfile.add_argument('zip_file',
+        help="zipfile with PDFs to classify",
+        type=str)
+
+    args = parser.parse_args()
+    # no sub-command given: nothing to dispatch to
+    if not args.__dict__.get("func"):
+        parser.print_help(file=sys.stderr)
+        sys.exit(-1)
+
+    args.sink = None
+    if args.kafka_mode:
+        produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env)
+        # status goes to stderr so stdout only ever carries data
+        print("Running in kafka output mode, publishing to {}\n".format(produce_topic),
+            file=sys.stderr)
+        args.sink = KafkaSink(kafka_hosts=args.kafka_hosts,
+            produce_topic=produce_topic)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/persist_tool.py b/python/persist_tool.py
new file mode 100755
index 0000000..69e9374
--- /dev/null
+++ b/python/persist_tool.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+
+"""
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
+
+Normally this is done by workers (in sandcrawler_worker.py) consuming from
+Kafka feeds, but sometimes we have bulk processing output we want to backfill.
+"""
+
+import os
+import sys
+import argparse
+
+from sandcrawler import *
+from sandcrawler.persist import *
+
+
+def run_cdx(args):
+    """
+    Backfills CDX lines from args.cdx_file through a PersistCdxWorker, in
+    batches of 200. Only HTTP 200/226 captures are pushed; PDF-only mimetype
+    filtering applies unless --no-mimetype-filter is given.
+    """
+    cdx_worker = PersistCdxWorker(
+        db_url=args.db_url,
+    )
+    mimetypes = None if args.no_mimetype_filter else ['application/pdf']
+    CdxLinePusher(
+        cdx_worker,
+        args.cdx_file,
+        filter_http_statuses=[200, 226],
+        filter_mimetypes=mimetypes,
+        #allow_octet_stream
+        batch_size=200,
+    ).run()
+
+def run_grobid(args):
+    """
+    Backfills GROBID JSON lines from args.json_file through a
+    PersistGrobidWorker, in batches of 50.
+    """
+    worker_kwargs = dict(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    grobid_worker = PersistGrobidWorker(**worker_kwargs)
+    JsonLinePusher(grobid_worker, args.json_file, batch_size=50).run()
+
+def run_grobid_disk(args):
+    """
+    Writes XML to individual files on disk, and also prints non-XML metadata to
+    stdout as JSON, which can be redirected to a separate file.
+    """
+    disk_worker = PersistGrobidDiskWorker(output_dir=args.output_dir)
+    JsonLinePusher(disk_worker, args.json_file).run()
+
+def run_pdftrio(args):
+    """
+    Backfills pdftrio JSON lines from args.json_file through a
+    PersistPdfTrioWorker, in batches of 100.
+    """
+    trio_worker = PersistPdfTrioWorker(db_url=args.db_url)
+    JsonLinePusher(trio_worker, args.json_file, batch_size=100).run()
+
+def run_pdftext(args):
+    """
+    Backfills pdftext JSON lines from args.json_file through a
+    PersistPdfTextWorker, in batches of 50.
+    """
+    worker_kwargs = dict(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    text_worker = PersistPdfTextWorker(**worker_kwargs)
+    JsonLinePusher(text_worker, args.json_file, batch_size=50).run()
+
+def run_ingest_file_result(args):
+    """
+    Backfills ingest_file_result JSON lines from args.json_file through a
+    PersistIngestFileResultWorker, in batches of 200.
+    """
+    result_worker = PersistIngestFileResultWorker(db_url=args.db_url)
+    JsonLinePusher(result_worker, args.json_file, batch_size=200).run()
+
+def run_ingest_request(args):
+    """
+    Backfills ingest_request JSON lines from args.json_file through a
+    PersistIngestRequestWorker, in batches of 200.
+    """
+    request_worker = PersistIngestRequestWorker(db_url=args.db_url)
+    JsonLinePusher(request_worker, args.json_file, batch_size=200).run()
+
+def main():
+    """
+    Command-line entry point: parses the shared database/S3 options plus one
+    backfill sub-command and dispatches to the matching run_* function.
+
+    S3 credentials default to the SANDCRAWLER_BLOB_ACCESS_KEY /
+    SANDCRAWLER_BLOB_SECRET_KEY environment variables, falling back to the
+    MINIO_ACCESS_KEY / MINIO_SECRET_KEY ones.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--db-url',
+        help="postgresql database connection string",
+        default="postgres:///sandcrawler")
+    parser.add_argument('--s3-url',
+        help="S3 (seaweedfs) backend URL",
+        default="localhost:9000")
+    parser.add_argument('--s3-access-key',
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
+    # the secret must come from the *_SECRET_KEY variable; this used to read
+    # the access key variable by mistake
+    parser.add_argument('--s3-secret-key',
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
+    parser.add_argument('--s3-bucket',
+        help="S3 (seaweedfs) bucket to persist into",
+        default="sandcrawler-dev")
+    subparsers = parser.add_subparsers()
+
+    sub_cdx = subparsers.add_parser('cdx',
+        help="backfill a CDX file into postgresql cdx table")
+    sub_cdx.set_defaults(func=run_cdx)
+    sub_cdx.add_argument('cdx_file',
+        help="CDX file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_cdx.add_argument('--no-mimetype-filter',
+        action='store_true',
+        help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
+
+    sub_grobid = subparsers.add_parser('grobid',
+        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+    sub_grobid.set_defaults(func=run_grobid)
+    sub_grobid.add_argument('json_file',
+        help="grobid file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_grobid.add_argument('--s3-only',
+        action='store_true',
+        help="only upload TEI-XML to S3 (don't write to database)")
+    sub_grobid.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+
+    sub_pdftext = subparsers.add_parser('pdftext',
+        help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+    sub_pdftext.set_defaults(func=run_pdftext)
+    sub_pdftext.add_argument('json_file',
+        help="pdftext file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_pdftext.add_argument('--s3-only',
+        action='store_true',
+        help="only upload TEI-XML to S3 (don't write to database)")
+    sub_pdftext.add_argument('--db-only',
+        action='store_true',
+        help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
+
+    sub_grobid_disk = subparsers.add_parser('grobid-disk',
+        help="dump GROBID output to (local) files on disk")
+    sub_grobid_disk.set_defaults(func=run_grobid_disk)
+    sub_grobid_disk.add_argument('json_file',
+        help="grobid file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+    sub_grobid_disk.add_argument('output_dir',
+        help="base directory to output into",
+        type=str)
+
+    sub_pdftrio = subparsers.add_parser('pdftrio',
+        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+    sub_pdftrio.set_defaults(func=run_pdftrio)
+    sub_pdftrio.add_argument('json_file',
+        help="pdftrio file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
+        help="backfill a ingest_file_result JSON dump into postgresql")
+    sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
+    sub_ingest_file_result.add_argument('json_file',
+        help="ingest_file_result file to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    sub_ingest_request = subparsers.add_parser('ingest-request',
+        help="backfill a ingest_request JSON dump into postgresql")
+    sub_ingest_request.set_defaults(func=run_ingest_request)
+    sub_ingest_request.add_argument('json_file',
+        help="ingest_request to import from (or '-' for stdin)",
+        type=argparse.FileType('r'))
+
+    args = parser.parse_args()
+    # no sub-command given: nothing to dispatch to
+    if not args.__dict__.get("func"):
+        print("Tell me what to do!", file=sys.stderr)
+        sys.exit(-1)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/pytest.ini b/python/pytest.ini
new file mode 100644
index 0000000..034a68e
--- /dev/null
+++ b/python/pytest.ini
@@ -0,0 +1,21 @@
+[pytest]
+
+# allow imports from files in current directory
+python_paths = .
+
+# search for 'test_*' functions in all python files, not just under tests
+python_files = *.py
+
+addopts = --pylint --pylint-rcfile=.pylintrc --pylint-error-types=EF --pylint-jobs=4
+
+# ignore various third party warnings (in .venv)
+filterwarnings =
+ ignore:.*common_exception_handling.*StopIteration:PendingDeprecationWarning
+ ignore:.*deprecated and will be removed in Werkzeug 1.0.*:DeprecationWarning
+ ignore::DeprecationWarning:.*surt
+ ignore::DeprecationWarning:.*urllib3
+ ignore::DeprecationWarning:.*wayback
+ ignore::DeprecationWarning:.*PIL
+ ignore::DeprecationWarning:.*justext
+
+log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
new file mode 100644
index 0000000..e461462
--- /dev/null
+++ b/python/sandcrawler/__init__.py
@@ -0,0 +1,10 @@
+
+from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
+from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
+from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
+from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .ingest import IngestFileWorker
+from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
+from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
+from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
new file mode 100644
index 0000000..e60b310
--- /dev/null
+++ b/python/sandcrawler/db.py
@@ -0,0 +1,418 @@
+
+import json
+import datetime
+from typing import Optional
+
+import psycopg2
+import psycopg2.extras
+import requests
+
class SandcrawlerPostgrestClient:
    """
    Read-only HTTP client for the sandcrawler-db PostgREST endpoint.

    Each lookup is a single GET against ``<api_url>/<table>`` where every
    filter is sent as a ``column=eq.<value>`` query parameter. HTTP error
    statuses are raised via ``raise_for_status()``.
    """

    def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
        # extra keyword arguments are accepted (and ignored) so callers can
        # pass a shared config dict
        self.api_url = api_url

    def _get_rows(self, table: str, **filters):
        """
        Fetches all rows of `table` matching the equality `filters`.

        Returns the decoded JSON response body (a list of row dicts when
        the request succeeds).
        """
        resp = requests.get(
            self.api_url + "/" + table,
            params={column: f"eq.{value}" for column, value in filters.items()},
        )
        resp.raise_for_status()
        return resp.json()

    def _get_first(self, table: str, **filters) -> Optional[dict]:
        """
        Like `_get_rows()`, but returns only the first row, or None when
        the response is empty.
        """
        rows = self._get_rows(table, **filters)
        if rows:
            return rows[0]
        return None

    def get_cdx(self, url):
        """
        Returns *all* cdx rows for the exact `url` (a list), or None if
        there are none. Unlike the other getters this does not unwrap the
        first row.
        """
        return self._get_rows("cdx", url=url) or None

    def get_grobid(self, sha1):
        """Returns the grobid row for the given SHA-1 (hex), or None."""
        return self._get_first("grobid", sha1hex=sha1)

    def get_pdftrio(self, sha1):
        """Returns the pdftrio row for the given SHA-1 (hex), or None."""
        return self._get_first("pdftrio", sha1hex=sha1)

    def get_pdf_meta(self, sha1):
        """Returns the pdf_meta row for the given SHA-1 (hex), or None."""
        return self._get_first("pdf_meta", sha1hex=sha1)

    def get_html_meta(self, sha1hex: str) -> Optional[dict]:
        """Returns the html_meta row for the given SHA-1 (hex), or None."""
        return self._get_first("html_meta", sha1hex=sha1hex)

    def get_file_meta(self, sha1):
        """Returns the file_meta row for the given SHA-1 (hex), or None."""
        return self._get_first("file_meta", sha1hex=sha1)

    def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
        """
        Returns the ingest_file_result row matching both `ingest_type` and
        the request's base URL, or None.
        """
        return self._get_first("ingest_file_result", ingest_type=ingest_type, base_url=url)

    def get_crossref(self, doi):
        """Returns the crossref row for the given DOI, or None."""
        return self._get_first("crossref", doi=doi)
+
class SandcrawlerPostgresClient:
    """
    Direct (psycopg2) writer for the sandcrawler SQL tables.

    Every ``insert_*`` method takes an open cursor plus a batch of records,
    de-duplicates the batch on the table's conflict key (the last record
    for a key wins), runs a multi-row ``INSERT ... ON CONFLICT`` and
    returns an ``(inserts, updates)`` tuple of row counts. Nothing is
    committed by the insert methods; call `commit()` separately.

    `on_conflict` is matched case-insensitively and must be "nothing" or,
    for tables that support it, "update"; anything else raises
    NotImplementedError before the batch is touched.
    """

    def __init__(self, db_url, **kwargs):
        # extra keyword arguments are accepted and ignored
        self.conn = psycopg2.connect(db_url)

    def cursor(self):
        """Returns a new cursor on the underlying connection."""
        return self.conn.cursor()

    def commit(self):
        """Commits the current transaction on the underlying connection."""
        return self.conn.commit()

    def _inserts_and_updates(self, resp, on_conflict):
        """
        Turns the ``RETURNING xmax`` rows of an upsert into counts.

        A returned xmax of 0 is counted as a fresh insert; any other value
        is counted as an update, but only when running in "update" mode.
        The mode is compared case-insensitively, matching how the
        ``insert_*`` methods interpret `on_conflict`.
        """
        xmax_values = [int(row[0]) for row in resp]
        inserts = sum(1 for xmax in xmax_values if xmax == 0)
        if on_conflict.lower() == "update":
            updates = len(xmax_values) - inserts
        else:
            updates = 0
        return (inserts, updates)

    @staticmethod
    def _conflict_clause(on_conflict, update_set=None):
        """
        Builds the SQL that follows ``ON CONFLICT ... DO``.

        `update_set` is the column assignment list used for "update" mode;
        pass None for tables where only "nothing" is implemented.
        """
        mode = on_conflict.lower()
        if mode == "nothing":
            action = " NOTHING"
        elif mode == "update" and update_set is not None:
            action = " UPDATE SET " + update_set
        else:
            raise NotImplementedError("on_conflict: {}".format(on_conflict))
        return action + " RETURNING xmax;"

    def _upsert(self, cur, sql, rows, key_len, on_conflict):
        """
        De-duplicates `rows` on their first `key_len` columns and executes
        `sql` for the remainder in pages of 250 rows.

        A single INSERT may not touch the same conflict key twice, so for
        duplicate keys only the last row is kept. An empty batch returns
        (0, 0) without contacting the database.
        """
        unique_rows = {tuple(row[:key_len]): row for row in rows}
        if not unique_rows:
            return (0, 0)
        resp = psycopg2.extras.execute_values(
            cur, sql, list(unique_rows.values()), page_size=250, fetch=True)
        return self._inserts_and_updates(resp, on_conflict)

    def insert_cdx(self, cur, batch, on_conflict="nothing"):
        """
        Inserts CDX rows; key is (url, datetime). Records without a
        'warc_path' are silently dropped. Only on_conflict="nothing" is
        implemented.
        """
        sql = """
            INSERT INTO
            cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
            VALUES %s
            ON CONFLICT ON CONSTRAINT cdx_pkey DO
        """ + self._conflict_clause(on_conflict)
        rows = [(d['url'],
                 d['datetime'],
                 d['sha1hex'],
                 d['mimetype'],
                 d['warc_path'],
                 int(d['warc_csize']),
                 int(d['warc_offset']))
                for d in batch if d.get('warc_path')]
        return self._upsert(cur, sql, rows, 2, on_conflict)

    def insert_file_meta(self, cur, batch, on_conflict="nothing"):
        """Inserts (or updates) file_meta rows; key is sha1hex."""
        sql = """
            INSERT INTO
            file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
            VALUES %s
            ON CONFLICT (sha1hex) DO
        """ + self._conflict_clause(on_conflict, """
                sha256hex=EXCLUDED.sha256hex,
                md5hex=EXCLUDED.md5hex,
                size_bytes=EXCLUDED.size_bytes,
                mimetype=EXCLUDED.mimetype
            """)
        rows = [(d['sha1hex'],
                 d['sha256hex'],
                 d['md5hex'],
                 int(d['size_bytes']),
                 d['mimetype'])
                for d in batch]
        return self._upsert(cur, sql, rows, 1, on_conflict)

    def insert_grobid(self, cur, batch, on_conflict="nothing"):
        """
        Inserts (or updates) grobid rows; key is the record's 'key' field
        (stored as sha1hex).

        NOTE: records are modified in place. 'fatcat_release' and
        'grobid_version' are hoisted out of 'metadata' when not already set
        on the record, and 'metadata' is replaced by its JSON serialization.
        """
        sql = """
            INSERT INTO
            grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
            VALUES %s
            ON CONFLICT (sha1hex) DO
        """ + self._conflict_clause(on_conflict, """
                grobid_version=EXCLUDED.grobid_version,
                status_code=EXCLUDED.status_code,
                status=EXCLUDED.status,
                fatcat_release=EXCLUDED.fatcat_release,
                updated=EXCLUDED.updated,
                metadata=EXCLUDED.metadata
            """)
        for r in batch:
            if r.get('metadata'):
                # sometimes these are only in metadata; shouldn't pass through
                # though (to save database space)
                for k in ('fatcat_release', 'grobid_version'):
                    if k not in r:
                        r[k] = r['metadata'].get(k)
                    r['metadata'].pop(k, None)
                r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
        rows = [(d['key'],
                 d.get('grobid_version') or None,
                 d['status_code'],
                 d['status'],
                 d.get('fatcat_release') or None,
                 # records without a timestamp are stamped with local "now"
                 d.get('updated') or datetime.datetime.now(),
                 d.get('metadata') or None,
                 )
                for d in batch]
        return self._upsert(cur, sql, rows, 1, on_conflict)

    def insert_pdf_meta(self, cur, batch, on_conflict="nothing"):
        """
        Inserts (or updates) pdf_meta rows; key is the first tuple element
        (sha1hex).

        batch elements are expected to have .to_sql_tuple() method
        """
        sql = """
            INSERT INTO
            pdf_meta (sha1hex, updated, status, has_page0_thumbnail, page_count, word_count, page0_height, page0_width, permanent_id, pdf_created, pdf_version, metadata)
            VALUES %s
            ON CONFLICT (sha1hex) DO
        """ + self._conflict_clause(on_conflict, """
                updated=EXCLUDED.updated,
                status=EXCLUDED.status,
                has_page0_thumbnail=EXCLUDED.has_page0_thumbnail,
                page_count=EXCLUDED.page_count,
                word_count=EXCLUDED.word_count,
                page0_height=EXCLUDED.page0_height,
                page0_width=EXCLUDED.page0_width,
                permanent_id=EXCLUDED.permanent_id,
                pdf_created=EXCLUDED.pdf_created,
                pdf_version=EXCLUDED.pdf_version,
                metadata=EXCLUDED.metadata
            """)
        rows = [d.to_sql_tuple() for d in batch]
        return self._upsert(cur, sql, rows, 1, on_conflict)

    def insert_html_meta(self, cur, batch, on_conflict="nothing"):
        """
        Inserts (or updates) html_meta rows; key is the first tuple element
        (sha1hex).

        batch elements are expected to have .to_sql_tuple() method
        """
        sql = """
            INSERT INTO
            html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
            VALUES %s
            ON CONFLICT (sha1hex) DO
        """ + self._conflict_clause(on_conflict, """
                updated=EXCLUDED.updated,
                status=EXCLUDED.status,
                scope=EXCLUDED.scope,
                has_teixml=EXCLUDED.has_teixml,
                has_thumbnail=EXCLUDED.has_thumbnail,
                word_count=EXCLUDED.word_count,
                biblio=EXCLUDED.biblio,
                resources=EXCLUDED.resources
            """)
        rows = [d.to_sql_tuple() for d in batch]
        return self._upsert(cur, sql, rows, 1, on_conflict)

    def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
        """
        Inserts (or updates) pdftrio rows; key is the record's 'key' field
        (stored as sha1hex). Version info is read from the optional nested
        'versions' dict; a missing or null 'versions' yields NULL columns.
        """
        sql = """
            INSERT INTO
            pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
                     models_date, ensemble_score, bert_score, linear_score,
                     image_score)
            VALUES %s
            ON CONFLICT (sha1hex) DO
        """ + self._conflict_clause(on_conflict, """
                updated=EXCLUDED.updated,
                status_code=EXCLUDED.status_code,
                status=EXCLUDED.status,
                pdftrio_version=EXCLUDED.pdftrio_version,
                models_date=EXCLUDED.models_date,
                ensemble_score=EXCLUDED.ensemble_score,
                bert_score=EXCLUDED.bert_score,
                linear_score=EXCLUDED.linear_score,
                image_score=EXCLUDED.image_score
            """)
        rows = []
        for d in batch:
            # 'versions' may be absent or explicitly null
            versions = d.get('versions') or {}
            rows.append((
                d['key'],
                d.get('updated') or datetime.datetime.now(),
                d['status_code'],
                d['status'],
                versions.get('pdftrio_version') or None,
                versions.get('models_date') or None,
                d.get('ensemble_score'),
                d.get('bert_score'),
                d.get('linear_score'),
                d.get('image_score'),
            ))
        return self._upsert(cur, sql, rows, 1, on_conflict)

    def insert_ingest_request(self, cur, batch, on_conflict="nothing"):
        """
        Inserts ingest_request rows; key is (link_source, link_source_id,
        ingest_type, base_url). Only on_conflict="nothing" is implemented.

        NOTE: records are modified in place. The optional fields 'ext_ids',
        'fatcat_release', 'edit_extra' and 'rel' are merged into the
        record's 'request' dict (if any), and the JSON serialization of
        that dict is stored under 'extra', which becomes the `request`
        column. A pre-existing 'extra' value is passed through when there
        is nothing to merge.
        """
        sql = """
            INSERT INTO
            ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
            VALUES %s
            ON CONFLICT ON CONSTRAINT ingest_request_pkey DO
        """ + self._conflict_clause(on_conflict)
        for r in batch:
            # in case these fields were already packed into 'request';
            # 'request' may be absent or explicitly null
            extra = r.get('request') or {}
            for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'):
                if r.get(k):
                    extra[k] = r[k]
            if extra:
                r['extra'] = json.dumps(extra, sort_keys=True)
        rows = [(d['link_source'],
                 d['link_source_id'],
                 d['ingest_type'],
                 d['base_url'],
                 d.get('ingest_request_source'),
                 d.get('release_stage') or None,
                 d.get('extra') or None,
                 )
                for d in batch]
        return self._upsert(cur, sql, rows, 4, on_conflict)

    def insert_ingest_file_result(self, cur, batch, on_conflict="nothing"):
        """
        Inserts (or updates) ingest_file_result rows; key is
        (ingest_type, base_url). In "update" mode the row's `updated`
        column is set to the database's now().
        """
        sql = """
            INSERT INTO
            ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
            VALUES %s
            ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
        """ + self._conflict_clause(on_conflict, """
                updated=now(),
                hit=EXCLUDED.hit,
                status=EXCLUDED.status,
                terminal_url=EXCLUDED.terminal_url,
                terminal_dt=EXCLUDED.terminal_dt,
                terminal_status_code=EXCLUDED.terminal_status_code,
                terminal_sha1hex=EXCLUDED.terminal_sha1hex
            """)
        rows = [(d['ingest_type'],
                 d['base_url'],
                 bool(d['hit']),
                 d['status'],
                 d.get('terminal_url'),
                 d.get('terminal_dt'),
                 d.get('terminal_status_code'),
                 d.get('terminal_sha1hex'),
                 )
                for d in batch]
        return self._upsert(cur, sql, rows, 2, on_conflict)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
new file mode 100644
index 0000000..b4215dc
--- /dev/null
+++ b/python/sandcrawler/grobid.py
@@ -0,0 +1,130 @@
+
+import requests
+
+from grobid2json import teixml2json
+from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .misc import gen_file_metadata
+
class GrobidClient(object):
    """
    Minimal HTTP client for a GROBID server's fulltext-processing endpoint.
    """

    def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
        self.host_url = host_url
        # default header-consolidation mode, used when a call to
        # process_fulltext() does not specify one
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))

    def process_fulltext(self, blob, consolidate_mode=None):
        """
        Sends `blob` (must be non-empty) to GROBID for fulltext processing.

        `consolidate_mode` overrides the client-wide default for this one
        request; pass None (the default) to use `self.consolidate_mode`.

        Returns dict with keys:
            - status_code
            - status (slug)
            - error_msg (if status == 'error')
            - tei_xml (if status is 200)

        A request timeout is reported as status 'error-timeout' with
        status_code -4 instead of raising.

        TODO: persist connection for performance?
        """
        assert blob

        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode

        try:
            grobid_response = requests.post(
                self.host_url + "/api/processFulltextDocument",
                files={
                    'input': blob,
                    # honor the per-call override resolved above
                    'consolidateHeader': consolidate_mode,
                    'consolidateCitations': 0, # too expensive for now
                    'includeRawCitations': 1,
                },
                timeout=180.0,
            )
        except requests.Timeout:
            return {
                'status': 'error-timeout',
                'status_code': -4, # heritrix3 "HTTP timeout" code
                'error_msg': 'GROBID request (HTTP POST) timeout',
            }

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
            if len(info['tei_xml']) > 12000000:
                # XML is larger than Kafka message size, and much larger than
                # an article in general; bail out
                info['status'] = 'error'
                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
                info.pop('tei_xml')
        else:
            # response.text is .content decoded as utf-8
            info['status'] = 'error'
            # keep only the start of the body so error rows stay small
            info['error_msg'] = grobid_response.text[:10000]
        return info

    def metadata(self, result):
        """
        Extracts a small metadata dict from a successful process_fulltext()
        result.

        Returns None unless result['status'] is 'success'. Otherwise the
        TEI-XML is converted with teixml2json() and the returned dict has a
        'biblio' sub-dict (title, authors, journal, date, doi) plus any of
        grobid_version, grobid_timestamp, fatcat_release, language_code;
        keys with empty values are left out.
        """
        if result['status'] != 'success':
            return None
        tei_json = teixml2json(result['tei_xml'], encumbered=False)
        meta = dict()
        biblio = dict()
        for k in ('title', 'authors', 'journal', 'date', 'doi', ):
            if tei_json.get(k):
                biblio[k] = tei_json[k]
        meta['biblio'] = biblio
        for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
            if tei_json.get(k):
                meta[k] = tei_json[k]
        return meta
+
class GrobidWorker(SandcrawlerFetchWorker):
    """
    Worker which fetches the blob referenced by a record and runs it
    through GROBID.
    """

    def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
        super().__init__(wayback_client=wayback_client)
        self.grobid_client = grobid_client
        self.sink = sink
        # header consolidation is always disabled for this worker
        self.consolidate_mode = 0

    def timeout_response(self, task):
        """
        Builds the result emitted when processing of `task` timed out;
        keyed by the task's 'sha1hex'.
        """
        return {
            'status': "error-timeout",
            'error_msg': "internal GROBID worker timeout",
            'source': task,
            'key': task['sha1hex'],
        }

    def process(self, record, key=None):
        """
        Fetches the blob for `record` and returns GROBID's result for it.

        If the fetch does not succeed, the fetch result is returned as-is.
        Otherwise the GROBID result is annotated with 'file_meta' (computed
        from the blob), 'source' (the input record) and 'key' (the blob's
        computed sha1hex). The `key` argument is not used.
        """
        # records without a 'sha1hex' are rejected (KeyError) up front
        record['sha1hex']

        fetched = self.fetch_blob(record)
        if fetched['status'] != 'success':
            return fetched

        pdf_bytes = fetched['blob']
        result = self.grobid_client.process_fulltext(
            pdf_bytes,
            consolidate_mode=self.consolidate_mode,
        )
        file_meta = gen_file_metadata(pdf_bytes)
        result['file_meta'] = file_meta
        result['source'] = record
        result['key'] = file_meta['sha1hex']
        return result
+
class GrobidBlobWorker(SandcrawlerWorker):
    """
    Variant of GrobidWorker for callers that already hold the file
    content: it is handed blobs directly and never fetches anything from a
    remote store.
    """

    def __init__(self, grobid_client, sink=None, **kwargs):
        super().__init__()
        self.grobid_client = grobid_client
        self.sink = sink
        # header consolidation is always disabled for this worker
        self.consolidate_mode = 0

    def process(self, blob, key=None):
        """
        Runs `blob` through GROBID and returns the result, annotated with
        'file_meta' (computed from the blob) and 'key' (the blob's computed
        sha1hex). Empty blobs yield None. The `key` argument is not used.
        """
        if blob:
            result = self.grobid_client.process_fulltext(
                blob,
                consolidate_mode=self.consolidate_mode,
            )
            file_meta = gen_file_metadata(blob)
            result['file_meta'] = file_meta
            result['key'] = file_meta['sha1hex']
            return result
        return None
+
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
new file mode 100644
index 0000000..cd0a8e8
--- /dev/null
+++ b/python/sandcrawler/html.py
@@ -0,0 +1,348 @@
+
+import re
+import sys
+import json
+import urllib.parse
+
+from bs4 import BeautifulSoup
+
+RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"')
+IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
+OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
+SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
+
+
def extract_fulltext_url(html_url, html_body):
    """
    Takes an HTML document (and URL), assumed to be a landing page, and tries
    to find a fulltext PDF url.

    Args:
        html_url: URL (str) the document was fetched from
        html_body: raw document body (bytes)

    Returns a dict. When something was found it has a 'technique' key
    naming the heuristic that matched, plus either 'pdf_url' (direct link
    to the PDF) or 'next_url' (another page to try), and for some
    publishers a 'release_stage'.

    On error, or if fails to extract a URL, returns an empty dict.
    """

    host_prefix = '/'.join(html_url.split('/')[:3])
    try:
        soup = BeautifulSoup(html_body, 'html.parser')
    except (TypeError, UnboundLocalError) as err:
        # parser failures on malformed input are treated as "nothing found"
        print(f"{err} (url={html_url})", file=sys.stderr)
        return dict()

    def body_text():
        # undecodable bytes are replaced instead of raising; the patterns
        # searched for below are plain ASCII, so matching is unaffected
        return html_body.decode('utf-8', errors='replace')

    ### General Tricks ###

    # highwire-style meta tag, and several variations of it
    meta = None
    for attrs in (
            {"name": "citation_pdf_url"},
            {"name": "bepress_citation_pdf_url"},
            {"name": "wkhealth_pdf_url"},
            # researchgate does this; maybe others also?
            {"property": "citation_pdf_url"},
            {"name": "eprints.document_url"},
    ):
        meta = soup.find('meta', attrs=attrs)
        if meta:
            break
    # if tag is only partially populated
    if meta and not meta.get('content'):
        meta = None
    # wiley has a weird almost-blank page we don't want to loop on
    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
        url = meta['content'].strip()
        if '://doi.org/' in url:
            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
        elif url.startswith('/'):
            if host_prefix+url == html_url:
                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
            else:
                return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
        elif url.startswith('http'):
            if url == html_url:
                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
            else:
                return dict(pdf_url=url, technique='citation_pdf_url')
        else:
            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)

    # remember the generating platform; used for the OJS check further down
    meta = soup.find('meta', attrs={"name":"generator"})
    meta_generator = None
    if meta and meta.get('content'):
        meta_generator = meta['content'].strip()

    ### Publisher/Platform Specific ###

    # research square (researchsquare.com)
    if 'researchsquare.com/article/' in html_url:
        # JSON in body with a field like:
        # "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
        m = RESEARCHSQUARE_REGEX.search(body_text())
        if m:
            url = m.group(1)
            assert len(url) < 4096
            return dict(release_stage="manuscript", pdf_url=url, technique='publisher')

    # elseiver linking hub
    # https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
    if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
        # <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/>
        redirect = soup.find("input", attrs={"name": "redirectURL"})
        if redirect and redirect.get('value'):
            url = redirect['value'].strip()
            if 'http' in url:
                url = urllib.parse.unquote(url)
                # drop any the query parameter
                url = url.split('?via')[0]
                return dict(next_url=url, technique="elsevier-linkinghub")

    # sciencedirect PDF URL extract
    # https://www.sciencedirect.com/science/article/pii/S0169204621000670
    if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
        json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
        url = None
        if json_tag:
            try:
                json_meta = json.loads(json_tag.string)
                pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
                # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
                url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
            except (KeyError, TypeError, json.JSONDecodeError):
                # best-effort: unexpected JSON shape just means no match here
                pass
        if url:
            return dict(pdf_url=url, technique="sciencedirect-munge-json")

    # sciencedirect PDF bounce page
    # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
    if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
        # window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&[...]&type=client';
        m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(body_text())
        if m:
            url = m.group(1)
            assert len(url) < 4000
            return dict(pdf_url=url, technique="sciencedirect-bounce")

    # ieeexplore.ieee.org
    # https://ieeexplore.ieee.org/document/8730316
    if '://ieeexplore.ieee.org/document/' in html_url:
        # JSON in body with a field like:
        # "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
        m = IEEEXPLORE_REGEX.search(body_text())
        if m:
            url = m.group(1)
            assert len(url) < 4096
            return dict(release_stage="published", pdf_url=host_prefix+url, technique="ieeexplore")
    # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
    if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
        # HTML iframe like:
        # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
        iframe = soup.find("iframe")
        # an iframe without a src attribute is simply not a match
        iframe_src = iframe.get('src') if iframe else None
        if iframe_src and '.pdf' in iframe_src:
            return dict(pdf_url=iframe_src, technique="iframe")

    # https://insights.ovid.com/crossref?an=00042307-202001000-00013
    # Ovid is some kind of landing page bounce portal tracking run-around.
    # Can extract actual journal URL from javascript blob in the HTML
    if '://insights.ovid.com/crossref' in html_url:
        # var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
        m = OVID_JOURNAL_URL_REGEX.search(body_text())
        if m:
            url = m.group(1)
            assert len(url) < 4096
            return dict(next_url=url, technique='ovid')

    # osf.io
    # https://osf.io/8phvx/
    # https://osf.io/preprints/socarxiv/8phvx/
    # wow, they ship total javascript crud! going to just guess download URL
    # based on URL for now. Maybe content type header would help?
    OSF_DOMAINS = [
        '://osf.io/',
        '://biohackrxiv.org/',
        '://psyarxiv.com/',
        '://arabixiv.org/',
        '://engrxiv.org/',
        '://edarxiv.org/',
        '://ecsarxiv.org/',
        '://ecoevorxiv.org/',
        '://frenxiv.org/',
        '://indiarxiv.org/',
        '://mindrxiv.org/',
        '://mediarxiv.org/',
        '://paleorxiv.org/',
        '://thesiscommons.org/',
    ]
    for domain in OSF_DOMAINS:
        if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url:
            if not html_url.endswith("/"):
                next_url = html_url+"/download"
            else:
                next_url = html_url+"download"
            return dict(next_url=next_url, technique='osf-by-url')

    # wiley
    # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
    if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
        if b"/doi/pdfdirect/" in html_body:
            next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
            return dict(next_url=next_url, technique='wiley-pdfdirect')

    # arxiv abstract pages
    if "://arxiv.org/abs/" in html_url:
        url = html_url.replace("/abs/", "/pdf/")
        return dict(pdf_url=url, technique='arxiv-url')

    # american archivist (OA)
    # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
    if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
        # use a more aggressive direct guess to avoid rate-limiting...
        if "/doi/10." in html_url:
            url = html_url.replace("/doi/10.", "/doi/pdf/10.")
            return dict(pdf_url=url, technique='archivist-url')
        # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
        hrefs = soup.find_all('a', attrs={"target":"_blank"})
        for href in hrefs:
            # anchors without an href attribute are skipped
            url = (href.get('href') or '').strip()
            if "/doi/pdf/" in url:
                if url.startswith('http'):
                    return dict(pdf_url=url, technique='publisher-href')
                elif url.startswith('/'):
                    return dict(pdf_url=host_prefix+url, technique='publisher-href')

    # protocols.io
    # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
    if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
        url = html_url + ".pdf"
        return dict(pdf_url=url, technique='protocolsio-url')

    # degruyter.com
    # https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
    if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
        url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf')
        return dict(pdf_url=url, technique='degruyter-url')

    # journals.lww.com (Wolters Kluwer)
    # https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
    # DISABLED: they seem to redirect our crawler back to a "Fulltext" page and
    # we never get the content.
    if "://journals.lww.com/" in html_url and False:
        # data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=[...]"
        for line in html_body.split(b'\n'):
            if b"data-pdf-url=" in line:
                line = line.decode('utf-8', errors='replace')
                url = line.strip().replace('data-pdf-url=', '').replace('"', '')
                if url.startswith('http') and 'pdfs.journals.lww.com' in url:
                    return dict(pdf_url=url, technique='journals.lww.com-jsvar')

    # www.ahajournals.org
    # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
    if "://www.ahajournals.org/doi/" in html_url and '/doi/pdf/' not in html_url:
        # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
        if b'/doi/pdf/10.' in html_body:
            url = html_url.replace('/doi/10.', '/doi/pdf/10.')
            url = url + "?download=true"
            return dict(pdf_url=url, technique='ahajournals-url')

    # ehp.niehs.nih.gov
    # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
    # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
    if "://ehp.niehs.nih.gov/doi/" in html_url:
        # <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
        if b'/doi/pdf/10.' in html_body:
            url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
            return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')

    # cogentoa.com
    # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
    if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
        # blech, it's a SPA! All JS
        # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
        url = html_url + ".pdf"
        return dict(pdf_url=url, technique='cogentoa-url')

    # chemrxiv.org (likely to be other figshare domains also)
    # https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
    if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url:
        # <script id="app-data" type="text/json"> [...] </script>
        json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"})
        if json_tag and json_tag.string:
            # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
            try:
                app_data = json.loads(json_tag.string)
                url = app_data.get('article', {}).get('exportPdfDownloadUrl')
            except (json.JSONDecodeError, AttributeError):
                # broken or unexpectedly-shaped JSON: no match, keep going
                url = None
            if isinstance(url, str) and url.startswith('http'):
                return dict(pdf_url=url, technique='figshare-json')

    # CNKI COVID-19 landing pages
    # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
    if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
        # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=[...]&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
        href = soup.find('a', attrs={"id":"pdfDown"})
        if href and href.get('href'):
            url = href['href'].strip().replace('&#xA;', '')
            if not url.startswith('http'):
                url = host_prefix + url
            return dict(pdf_url=url, technique='cnki-href')

    # RWTH AACHEN repository
    if '://publications.rwth-aachen.de/record/' in html_url:
        record_id = html_url.split('/')[-1]
        url = f"{html_url}/files/{record_id}.pdf"
        # only trust the guess if the page itself links to it
        if record_id.isdigit() and url.encode('utf-8') in html_body:
            return dict(pdf_url=url, technique='rwth-aachen-url')

    # physchemaspects.ru
    if '://physchemaspects.ru/' in html_url and soup:
        for href in soup.find_all('a'):
            if href.text == "download PDF file" and href.get('href'):
                url = href['href']
                if url.startswith('/'):
                    url = host_prefix + url
                return dict(pdf_url=url, technique='physchemaspects-href')

    # OJS 3 (some)
    if meta_generator and meta_generator.startswith("Open Journal Systems"):
        href = soup.find('a', attrs={"class":"obj_galley_link file"})
        if href and href.text and "pdf" in href.text.lower() and href.get('href'):
            url = href['href'].strip()
            if url.startswith('/'):
                url = host_prefix + url
            return dict(pdf_url=url, technique='ojs-galley-href')

    # ETH zurich e-periodica
    if '://www.e-periodica.ch/digbib/view' in html_url:
        url = html_url.replace('digbib/view', 'cntmng').split('#')[0]
        if url.encode('utf-8') in html_body:
            return dict(pdf_url=url, technique='href-eperiodica')

    # JMIR
    # https://mhealth.jmir.org/2020/7/e17891/
    if '.jmir.org/' in html_url and "/pdf" not in html_url and html_url.endswith("/"):
        url = html_url + "pdf"
        return dict(pdf_url=url, technique='jmir-url')

    ### below here we are doing guesses

    # generic guess: try current URL plus .pdf, if it exists in the HTML body
    if '.pdf' not in html_url:
        url = html_url + ".pdf"
        if url.encode('utf-8') in html_body:
            return dict(pdf_url=url, technique='guess-url-plus-pdf')

    return dict()
+
def test_regex():
    """
    Sanity-check the two module-level URL regexes against snippets of
    publisher JavaScript: each must capture the full URL as group 1.
    """
    # NOTE(review): whitespace inside the snippets is reconstructed; the
    # regexes only need the statement itself to appear on one line.

    # OVID/LWW: article URL assigned to a JS variable
    journal_url = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
    ovid_snippet = (
        "\n"
        "    blah\n"
        '    var journalURL = "' + journal_url + '";\n'
        "    asdf"
    )
    ovid_match = OVID_JOURNAL_URL_REGEX.search(ovid_snippet)
    assert ovid_match.group(1) == journal_url

    # ScienceDirect: interstitial page which bounces to a signed PDF URL
    url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
    bounce_snippet = (
        "\n"
        "    window.onload = function () {\n"
        "        window.location = '" + url + "';\n"
        "        refreshOriginalWindow();\n"
        "    }\n"
        "    "
    )
    bounce_match = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(bounce_snippet)
    assert bounce_match.group(1) == url
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
new file mode 100644
index 0000000..f11cac4
--- /dev/null
+++ b/python/sandcrawler/html_ingest.py
@@ -0,0 +1,441 @@
+
+import io
+import sys
+import json
+import datetime
+import argparse
+import xml.etree.ElementTree as ET
+from typing import List, Optional, Any, Tuple
+
+import trafilatura
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
+from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+
+
# Label stored with every extraction result, so output can be traced back to
# the trafilatura release that produced it.
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"

def html_extract_body_teixml(doc: bytes) -> dict:
    """
    Run trafilatura over raw HTML bytes and wrap the outcome in a status dict.

    Returns one of:

    - status "success": with 'agent', 'tei_xml' and 'word_count' (whitespace
      separated tokens in the TEI body text)
    - status "trafilatura-parse-error": with 'error_msg' (truncated to 1000
      characters) when extraction raised
    - status "empty-xml": with 'agent', when trafilatura returned nothing
    """
    # Malformed doctype (stray `2012`) which is stripped before a retry; this
    # literal is 106 bytes long.
    broken_doctype = b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'

    try:
        tei_xml = trafilatura.extract(
            doc,
            tei_output=True,
            include_comments=False,
            include_formatting=True,
        )
    except Exception as exc:
        return {
            "status": "trafilatura-parse-error",
            "error_msg": str(exc)[:1000],
        }

    if not tei_xml:
        if doc.startswith(broken_doctype):
            # hack for firstmonday.org: drop the doctype and try once more
            return html_extract_body_teixml(doc[len(broken_doctype):])
        return {"status": "empty-xml", "agent": TRAFILATURA_AGENT}

    tokens = teixml_body_text(tei_xml).split()
    return {
        "status": "success",
        "agent": TRAFILATURA_AGENT,
        "tei_xml": tei_xml,
        "word_count": len(tokens),
    }
+
def teixml_body_text(doc_xml: str) -> str:
    """
    Extract the plain text of the TEI <body> element from a TEI-XML string.

    All text nodes under the first `tei:body` element are joined with single
    spaces. Returns an empty string when the document has no body element.

    Raises xml.etree.ElementTree.ParseError if doc_xml is not well-formed.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    tree = ET.fromstring(doc_xml)
    body = tree.find('.//tei:body', ns)
    # Compare against None explicitly: the truth value of an Element reflects
    # whether it has child elements, so a body holding only text is falsy.
    if body is None:
        return ""
    return " ".join(body.itertext())
+
class WebResource(pydantic.BaseModel):
    """
    One archived sub-resource of an HTML page, as built by
    quick_fetch_html_resources() and fetch_html_resources() in this module.
    """
    # SURT form of the URL, copied from the CDX row
    surt: str
    # capture time of the resource
    timestamp: datetime.datetime
    # URL as recorded in the CDX row
    url: str
    # payload SHA-1 (hex): taken from the CDX row in quick mode, computed from
    # the fetched body in full mode
    sha1hex: str
    mimetype: str
    # HTTP status code of the capture
    status_code: int
    # size and sha256hex are only filled in by the full fetch; the CDX-only
    # quick mode leaves both as None
    size: Optional[int]
    sha256hex: Optional[str]
    # the 'type' value of the extracted resource record
    resource_type: Optional[str]

    class Config:
        # emit datetimes as ISO 8601 strings when serializing to JSON
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat()
        }
+
class IngestWebResult(pydantic.BaseModel):
    """
    Result of one HTML ingest attempt, as returned by run_single().
    """
    # "success", or a failure label such as "wrong-mimetype" / "wrong-scope"
    # (or the status of the failed wayback lookup)
    status: str
    # True only for a successful ingest
    hit: bool
    # NOTE(review): never set within this module -- confirm who populates it
    error_message: Optional[str]
    # CDX row of the HTML capture, converted to a dict
    cdx: Optional[dict]
    # NOTE(review): terminal and request are untyped placeholders and are not
    # set within this module
    terminal: Optional[Any] # TODO
    request: Optional[Any] # TODO
    # file metadata dict for the HTML body (output of gen_file_metadata)
    file_meta: Optional[dict]
    # bibliographic metadata extracted from the HTML
    html_biblio: Optional[BiblioMetadata]
    # output of html_guess_scope()
    scope: Optional[str]
    # output of html_extract_body_teixml()
    html_body: Optional[dict]
    # archived sub-resources of the page
    html_resources: Optional[List[WebResource]]

    class Config:
        arbitrary_types_allowed = True
        # emit datetimes as ISO 8601 strings when serializing to JSON
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat(),
        }
+
class HtmlMetaRow(pydantic.BaseModel):
    """
    Flattened summary of an HTML ingest, shaped for the html_meta SQL table
    (see to_sql_tuple()).
    """
    sha1hex: str
    status: str
    scope: Optional[str]
    has_teixml: bool
    has_thumbnail: bool
    word_count: Optional[int]
    biblio: Optional[dict]
    resources: Optional[List[dict]]

    class Config:
        arbitrary_types_allowed = True
        # emit datetimes as ISO 8601 strings when serializing to JSON
        json_encoders = {datetime.datetime: lambda value: value.isoformat()}

    def to_sql_tuple(self) -> Tuple:
        """
        This is for the html_meta SQL table.

        Returns the column values in order: sha1hex, updated (current local
        time), status, scope, has_teixml, has_thumbnail, word_count, biblio
        and resources. The last two are JSON strings with sorted keys, or
        None when the field is unset or empty.
        """
        updated = datetime.datetime.now()
        biblio_json = json.dumps(self.biblio, sort_keys=True) if self.biblio else None
        resources_json = json.dumps(self.resources, sort_keys=True) if self.resources else None
        return (
            self.sha1hex,
            updated,
            self.status,
            self.scope,
            self.has_teixml,
            self.has_thumbnail,
            self.word_count,
            biblio_json,
            resources_json,
        )
+
+
def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the lazy version that just does a CDX lookup for each resource.

    Takes a list instead of single record because we may want to circuit break
    on failure, and may introduce concurrency internal to this function.

    Args:
        resources: dicts with at least 'url' and 'type' keys
        cdx_client: client used for the per-resource `lookup_best()` call
        when: preferred capture time; passed to the lookup as `closest`

    Returns:
        One WebResource per resource with a usable CDX row. `size` and
        `sha256hex` are always None, as no bodies are fetched. Rows without a
        status code (revisit records) are skipped with a warning.

    Raises:
        NoCaptureError: if any resource has no CDX row at all
    """

    full = []
    closest = when and datetime_to_cdx(when)
    for resource in resources:
        cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
        if not cdx_row:
            raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
        if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
            print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
        if not cdx_row.status_code:
            # TODO: fall back to a full fetch?
            print(" WARN: skipping revisit record", file=sys.stderr)
            continue
        full.append(WebResource(
            surt=cdx_row.surt,
            # CDX rows carry the capture time in CDX format; parse it the same
            # way fetch_html_resources() and run_single() do, instead of
            # relying on pydantic to coerce the raw value
            timestamp=parse_cdx_datetime(cdx_row.datetime),
            url=cdx_row.url,
            sha1hex=cdx_row.sha1hex,
            mimetype=cdx_row.mimetype,
            status_code=cdx_row.status_code,
            size=None,
            sha256hex=None,
            resource_type=resource['type'],
        ))

    return full
+
+
def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    Full-fat counterpart of quick_fetch_html_resources(): every resource body
    is retrieved via the wayback client and hashed locally.

    Raises NoCaptureError when a resource lookup does not succeed, and
    WaybackContentError when the SHA-1 of the fetched body disagrees with the
    CDX row.

    Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
    """

    closest = when and datetime_to_cdx(when)
    fetched: List[WebResource] = []

    for entry in resources:
        target_url = entry['url']
        resp = wayback_client.lookup_resource(target_url, closest=closest)
        if not (resp and resp.status == 'success'):
            raise NoCaptureError(f"HTML sub-resource not found: {target_url}")

        cdx = resp.cdx
        meta = gen_file_metadata(resp.body, allow_empty=True)
        # the body we received must be the one the CDX index describes
        if meta['sha1hex'] != cdx.sha1hex:
            raise WaybackContentError(f"wayback payload sha1hex mismatch: {cdx.datetime} {cdx.url}")

        web_resource = WebResource(
            surt=cdx.surt,
            timestamp=parse_cdx_datetime(cdx.datetime),
            url=cdx.url,
            sha1hex=meta['sha1hex'],
            mimetype=meta['mimetype'],
            # fall back to the revisit CDX row when this row has no status code
            status_code=cdx.status_code or resp.revisit_cdx.status_code,
            size=meta['size_bytes'],
            sha256hex=meta['sha256hex'],
            resource_type=entry['type'],
        )
        fetched.append(web_resource)

    return fetched
+
+
def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
    """
    Guess which publishing platform served an HTML page.

    Checks, in order: the `generator` meta tag (or, if that tag is absent, the
    text of the `developedBy` link); an OJS-specific body id; known "powered
    by" snippets in the raw HTML; the favicon URL; and finally the page URL.

    Returns a short platform label ("ojs3", "ojs", "plone", "wordpress",
    "blogger", "arpha", "galenos", "journalssystem.com", "indexcopernicus",
    "scielo"), or None when nothing matched. `biblio` is currently unused.
    """

    generator: Optional[str] = None
    generator_elem = doc.css_first("meta[name='generator']")
    if generator_elem:
        # .get(): a generator tag without a content attribute must not raise
        generator = generator_elem.attrs.get('content')
    else:
        generator_elem = doc.css_first("a[id='developedBy']")
        if generator_elem:
            generator = generator_elem.text()

    if generator:
        generator_lower = generator.lower()
        # order matters: the OJS 3 needle must be tried before plain OJS
        generator_platforms = (
            ("open journal systems 3", "ojs3"),
            ("open journal systems", "ojs"),
            ("plone", "plone"),
            ("wordpress", "wordpress"),
            ("blogger", "blogger"),
        )
        for needle, platform in generator_platforms:
            if needle in generator_lower:
                return platform

    if doc.css_first("body[id='pkp-common-openJournalSystems']"):
        return "ojs"

    try:
        # read the serialized document once; it may be missing (not a str),
        # and accessing it may raise UnicodeDecodeError
        html = doc.html
        if isinstance(html, str):
            if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in html:
                return "ojs"
            if 'Powered by <a target="_blank" href="http://arphahub.com">' in html:
                return "arpha"
            if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in html:
                return "galenos"
    except UnicodeDecodeError:
        pass

    icon_elem = doc.css_first("link[type='image/x-icon']")
    if icon_elem:
        # a missing or valueless href is treated as an empty string
        icon_href = icon_elem.attrs.get('href') or ""
        if 'journalssystem.com' in icon_href:
            return "journalssystem.com"
        elif 'indexcopernicus.com' in icon_href:
            return "indexcopernicus"

    if 'scielo' in url:
        return "scielo"

    return None
+
def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
    """
    This function tries to guess if an HTML document represents one of:

    - article-fulltext
    - article-abstract
    - article-sample
    - supplement
    - component
    - issue-fulltext
    - landingpage
    - homepage-domain
    - blocked-paywall
    - blocked-login
    - blocked-captcha
    - blocked-cookie
    - blocked-forbidden
    - errorpage
    - stub
    - other
    - unknown

    Unknown implies the page could be anything. "other" implies it is not
    fulltext or a landing page, but could be one of the other categories.

    Args:
        url: URL of the page; must contain at least two '/' characters
        doc: parsed HTML of the page
        biblio: bibliographic metadata extracted from the page, if any
        word_count: number of words in the extracted body text, if known

    Raises:
        AssertionError: if `url` does not look like a full URL
    """

    # assert that this is a real URL
    assert url.count('/') >= 2

    # basic paywall and loginwall detection based on URL
    if url.endswith("/cookieAbsent"):
        return "blocked-cookie"
    if "://page-one.live.cf.public.springer.com" in url:
        return "article-sample"

    if "scielo" in url:
        if "sci_abstract" in url:
            return "landingpage"
        if "sci_arttext" in url:
            return "article-fulltext"

    if "showcaptcha.asp" in url:
        return "blocked-captcha"

    # is this the top-level URL of the domain? aka, no path?
    slash_count = url.count('/')
    if slash_count <= 2 or (slash_count == 3 and url.endswith('/')):
        return "homepage-domain"

    platform = html_guess_platform(url, doc, biblio)

    if biblio:
        if biblio.html_fulltext_url:
            # the page declares where its fulltext lives: either we are on
            # that URL already, or this is only the landing page
            if url_fuzzy_equal(biblio.html_fulltext_url, url):
                return "article-fulltext"
            else:
                return "landingpage"

    # platform-specific detection
    if platform in ("ojs", "ojs3"):
        # These branches return "article-fulltext" (one of the documented
        # scopes). A bare "fulltext" label would not be in the list above and
        # would be rejected as wrong-scope by run_single().
        if biblio and biblio.title:
            if word_count and word_count > 1200:
                return "article-fulltext"
            else:
                return "landingpage"
        else:
            if "/article/view/" in url and word_count and word_count > 600:
                return "article-fulltext"
            return "other"
    elif platform == "journalssystem.com":
        if biblio and biblio.pdf_fulltext_url and word_count and word_count < 1000:
            return "landingpage"

    # more platform/publisher specific checks
    if "karger.com/Article/Abstract" in url:
        return "landingpage"
    if "dergipark.gov.tr" in url and not ("download/article-file" in url):
        return "other"

    try:
        html = doc.html
        if isinstance(html, str) and "<center><h1>403 Forbidden</h1></center>" in html:
            # cloudflare block pattern
            return "blocked-forbidden"
    except UnicodeDecodeError:
        pass

    print(f" scope guessing: platform {platform} word count: {word_count}", file=sys.stderr)

    # fallback: guess based on word count (arbitrary guesses here)
    if word_count is not None:
        if word_count < 20:
            return "stub"
        elif word_count > 500 and platform in ['wordpress', 'blogger']:
            return "article-fulltext"
        elif word_count > 1200:
            return "article-fulltext"

    return "unknown"
+
+
def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
    """
    Ingest a single HTML page from wayback and describe the outcome.

    Looks up the capture of `url` (closest to `timestamp`, if given), extracts
    bibliographic metadata and TEI-XML body text, guesses the page scope, and,
    for pages in scope, resolves every sub-resource. With `quick_mode` the
    sub-resources are only looked up in CDX; otherwise each one is fetched.

    Non-success results carry status "wrong-mimetype", "wrong-scope", or the
    status of the failed wayback lookup.
    """

    def _cdx_summary(resource: ResourceResult) -> Optional[dict]:
        # CDX row as a dict, or the falsy cdx value itself when there is none
        return resource.cdx and cdx_to_dict(resource.cdx)

    adblock = load_adblock_rules()
    wayback_client = WaybackClient()

    html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
    if html_resource.status != "success":
        return IngestWebResult(
            status=html_resource.status,
            hit=False,
            cdx=_cdx_summary(html_resource),
        )

    assert html_resource.terminal_status_code == 200

    file_meta = gen_file_metadata(html_resource.body)
    file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)

    mimetype_ok = file_meta['mimetype'] in ("text/html", "text/xml")
    if not mimetype_ok:
        return IngestWebResult(
            status="wrong-mimetype",
            hit=False,
            cdx=_cdx_summary(html_resource),
            file_meta=file_meta,
        )

    html_doc = HTMLParser(html_resource.body)
    html_biblio = html_extract_biblio(url, html_doc)
    html_body = html_extract_body_teixml(html_resource.body)
    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))

    # only fulltext (or undetermined) pages are worth resolving resources for
    in_scope = html_scope in ('article-fulltext', 'unknown')
    if not in_scope:
        return IngestWebResult(
            status="wrong-scope",
            hit=False,
            cdx=_cdx_summary(html_resource),
            file_meta=file_meta,
            html_biblio=html_biblio,
            scope=html_scope,
        )

    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
    assert len(raw_resources) <= 200

    # sub-resources should be captures from around the time of the page itself
    when = parse_cdx_datetime(html_resource.cdx.datetime)

    full_resources: List[WebResource] = (
        quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
        if quick_mode
        else fetch_html_resources(raw_resources, wayback_client, when)
    )

    return IngestWebResult(
        status="success",
        hit=True,
        cdx=_cdx_summary(html_resource),
        file_meta=file_meta,
        html_body=html_body,
        html_biblio=html_biblio,
        scope=html_scope,
        html_resources=full_resources,
    )
+
+
def main() -> None:
    """
    Command-line entry point. Run this command like:

        python -m sandcrawler.html_ingest

    The only sub-command is `single`, which ingests one URL and prints the
    result as JSON on stdout. With no sub-command, usage help is printed to
    stderr and the process exits with status -1.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    single = subparsers.add_parser(
        "single", help="tries to ingest a single URL, dumps result to stdout"
    )
    single.set_defaults(func="run_single")
    single.add_argument("url", help="URL to fetch", type=str)
    single.add_argument(
        "--timestamp",
        help="timestamp for which to fetch document from wayback",
        type=str,
    )
    single.add_argument(
        "--quick-mode",
        help="don't fetch resources, only do CDX lookup",
        action="store_true",
    )

    args = parser.parse_args()
    command = getattr(args, "func", None)
    if not command:
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    # no other sub-commands exist yet
    if command != "run_single":
        raise NotImplementedError()

    result = run_single(args.url, args.timestamp, args.quick_mode)
    print(result.json(indent=2, exclude_none=True))

if __name__ == "__main__":
    main()
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
new file mode 100644
index 0000000..1a328ef
--- /dev/null
+++ b/python/sandcrawler/html_metadata.py
@@ -0,0 +1,857 @@
+
+import sys
+import datetime
+from typing import List, Optional, Any, Tuple, Dict
+import urllib.parse
+
+import dateparser
+from selectolax.parser import HTMLParser
+import pydantic
+import braveblock
+
+from sandcrawler.misc import url_fuzzy_equal
+
+
# This is a map of metadata keys to CSS selectors (single-valued fields).
# Sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
# - inspection of actual publisher HTML
# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
# - "HTML meta tags used by journal articles"
#   https://gist.github.com/hubgit/5985963
# Selectors are ordered mostly by preference/quality (best option first),
# though sometimes re-ordered for lookup efficiency (lookup stops after the
# first match).
#
# NOTE(review): the og:* selectors under "title", "abstract" and "lang" match
# on the `name` attribute, while "container_name" matches og:site_name on
# `property` -- confirm whether the `name` variants are intentional.
HEAD_META_PATTERNS: Dict[str, List[str]] = {
    "title": [
        "meta[name='citation_title']",
        "meta[name='eprints.title']",
        "meta[name='prism.title']",
        "meta[name='bepress_citation_title']",
        "meta[name='og:title']",
        "meta[name='dcterms.title']",
        "meta[name='dc.title']",
    ],
    "subtitle": [
        "meta[name='prism.subtitle']",
    ],
    "doi": [
        "meta[name='citation_doi']",
        "meta[name='DOI']",
        "meta[id='DOI']",
        "meta[name='prism.doi']",
        "meta[name='bepress_citation_doi']",
        "meta[name='dc.identifier.doi']",
        "meta[name='dc.identifier'][scheme='doi']",
    ],
    "pmid": [
        "meta[name='citation_pmid']",
    ],
    "abstract": [
        "meta[name='citation_abstract']",
        "meta[name='bepress_citation_abstract']",
        "meta[name='eprints.abstract']",
        "meta[name='dcterms.abstract']",
        "meta[name='prism.teaser']",
        "meta[name='dc.description']",
        "meta[name='og:description']",
    ],
    "container_name": [
        "meta[name='citation_journal_title']",
        "meta[name='bepress_citation_journal_title']",
        "meta[name='citation_conference_title']",
        "meta[name='bepress_citation_conference_title']",
        "meta[name='prism.publicationName']",
        "meta[name='eprints.publication']",
        "meta[name='dc.relation.ispartof']",
        "meta[name='dc.source']",
        "meta[property='og:site_name']",
    ],
    "container_abbrev": [
        "meta[name='citation_journal_abbrev']",
    ],
    # unparsed date strings, in whatever format the publisher emitted
    "raw_date": [
        "meta[name='citation_publication_date']",
        "meta[name='bepress_citation_publication_date']",
        "meta[name='prism.publicationDate']",
        "meta[name='citation_date']",
        "meta[name='bepress_citation_date']",
        "meta[name='citation_online_date']",
        "meta[name='bepress_citation_online_date']",
        "meta[itemprop='datePublished']",
        "meta[name='article:published']",
        "meta[name='eprints.datestamp']",
        "meta[name='eprints.date']",
        "meta[name='dc.date.created']",
        "meta[name='dc.issued']",
        "meta[name='dcterms.date']",
        "meta[name='dc.date']",
    ],
    # NOTE(review): both selectors match on `itemprop`, unlike the `name`
    # attribute used for other citation_*/prism fields -- confirm
    "release_year": [
        "meta[itemprop='citation_year']",
        "meta[itemprop='prism:copyrightYear']",
    ],
    "first_page": [
        "meta[name='citation_firstpage']",
        "meta[name='bepress_citation_firstpage']",
        "meta[name='prism.startingPage']",
        "meta[name='dc.citation.spage']",
    ],
    "last_page": [
        "meta[name='citation_lastpage']",
        "meta[name='bepress_citation_lastpage']",
        "meta[name='prism.endingPage']",
        "meta[name='dc.citation.epage']",
    ],
    "issue": [
        "meta[name='citation_issue']",
        "meta[name='bepress_citation_issue']",
        "meta[name='prism.issueIdentifier']",
        "meta[name='dc.citation.issue']",
    ],
    "volume": [
        "meta[name='citation_volume']",
        "meta[name='bepress_citation_volume']",
        "meta[name='prism.volume']",
        "meta[name='dc.citation.volume']",
    ],
    "number": [
        "meta[name='citation_technical_report_number']",
        "meta[name='bepress_citation_technical_report_number']",
        "meta[name='citation_number']",
        "meta[name='bepress_citation_number']",
        "meta[name='prism.number']",
    ],
    "container_issn": [
        "meta[name='citation_issn']",
        "meta[name='bepress_citation_issn']",
        "meta[name='prism.issn']",
        "meta[name='prism.eIssn']",
        "meta[name='eprints.issn']",
        "meta[name='dc.source.issn']",
    ],
    "isbn": [
        "meta[name='citation_isbn']",
        "meta[name='bepress_citation_isbn']",
        "meta[name='prism.isbn']",
    ],
    "publisher": [
        "meta[name='citation_publisher']",
        "meta[name='bepress_citation_publisher']",
        "meta[name='eprints.publisher']",
        "meta[name='citation_technical_report_institution']",
        "meta[name='dcterms.publisher']",
        "meta[name='dc.publisher']",
    ],
    # publisher-supplied type label, not yet normalized
    "raw_release_type": [
        "meta[name='citation_article_type']",
        "meta[name='bepress_citation_article_type']",
        "meta[name='prism.contentType']",
        "meta[name='eprints.type']",
        "meta[name='dc.type']",
    ],
    "lang": [
        "meta[name='citation_language']",
        "meta[name='bepress_citation_language']",
        "meta[name='dcterms.language']",
        "meta[name='dc.language']",
        "meta[name='og:locale']",
    ],
}
+
# Map of multi-valued metadata keys to CSS selectors, in the same format as
# HEAD_META_PATTERNS.
# NOTE(review): presumably every element matching a selector contributes a
# value (rather than only the first) -- confirm against the extraction code.
HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
    "contrib_names": [
        "meta[name='citation_author']",
        "meta[name='bepress_citation_author']",
        "meta[name='eprints.creators_name']",
        "meta[name='dcterms.creator']",
        "meta[name='article:author']",
        "meta[name='dc.creator']",
        "meta[name='dc.contributor']",
    ],
    # TODO: citation_author_institution
    "raw_references": [
        "meta[name='citation_reference']",
    ],
    "raw_identifiers": [
        "meta[name='eprints.id_number']",
        "meta[name='dcterms.identifier']",
        "meta[name='dc.identifier']",
    ],
}
+
# Patterns for locating a link to XML fulltext in an HTML page. Each entry has:
#
# - "selector": CSS selector for the element carrying the link
# - "attr": attribute of that element which holds the URL
# - "technique": free-form label naming the pattern
# - "in_doc_url" (optional), "in_fulltext_url" (optional): substrings;
#   NOTE(review): presumably required to appear in the page URL and in the
#   extracted URL respectively -- confirm against the matching code
# - "example_page" (optional): a page on which the pattern was seen
XML_FULLTEXT_PATTERNS: List[dict] = [
    {
        "selector": "meta[name='citation_xml_url']",
        "attr": "content",
        "technique": "citation_xml_url",
    },
    {
        "selector": "meta[name='fulltext_xml']",
        "attr": "content",
        "technique": "fulltext_xml",
    },
    {
        "selector": "link[rel='alternate'][type='application/xml']",
        "attr": "href",
        "technique": "alternate link",
    },
    {
        "selector": "link[rel='alternate'][type='text/xml']",
        "attr": "href",
        "technique": "alternate link",
    },
    {
        "in_doc_url": "scielo",
        "in_fulltext_url": "articleXML",
        "selector": "a[target='xml']",
        "attr": "href",
        "technique": "SciElo XML link",
    },
    {
        "in_doc_url": "/article/view/",
        "in_fulltext_url": "viewXML",
        "selector": "a[class='obj_galley_link']",
        "attr": "href",
        "technique": "OJS Gallery XML link",
    },
    {
        "in_fulltext_url": "/download/xml/",
        "selector": "a[title='XML']",
        "attr": "href",
        "technique": "ARPHA XML link",
        "example_page": "https://zookeys.pensoft.net/article/26391",
    },
    {
        "in_doc_url": "frontiersin.org/",
        "in_fulltext_url": "xml",
        "selector": "a.download-files-nlm",
        "attr": "href",
        "technique": "XML (NLM) download link (frontiersin.org)",
        "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
    },
]
+
# Patterns for locating a link to HTML fulltext in an HTML page. Entries use
# the keys "selector" (CSS selector), "attr" (attribute holding the URL) and
# "technique" (label), plus optional "in_doc_url" / "in_fulltext_url"
# substrings.
# NOTE(review): the substrings presumably must appear in the page URL and in
# the extracted URL respectively -- confirm against the matching code.
HTML_FULLTEXT_PATTERNS: List[dict] = [
    {
        "selector": "meta[name='citation_fulltext_html_url']",
        "attr": "content",
        "technique": "citation_fulltext_html_url",
    },
    {
        "selector": "link[rel='alternate'][type='text/html']",
        "attr": "href",
        "technique": "alternate link",
    },
    {
        "in_doc_url": "/article/view/",
        "in_fulltext_url": "inline=1",
        "selector": "iframe[name='htmlFrame']",
        "attr": "src",
        "technique": "OJS HTML iframe",
    },
    {
        "in_doc_url": "dovepress.com",
        "in_fulltext_url": "-fulltext-",
        "selector": "a[id='view-full-text']",
        "attr": "href",
        "technique": "dovepress fulltext link",
    },
]
+
# Patterns for locating the download link of an article component (such as a
# figure). Entries use the keys "selector", "attr", "technique" and
# "example_page", plus optional "in_doc_url" / "in_fulltext_url" substrings.
COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
    {
        "in_doc_url": "pensoft.net/article/", # also /element/
        "in_fulltext_url": "/download/fig/",
        "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
        "attr": "href",
        "technique": "Active figure download link (zookeys)",
        "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
    },
]
+
+ # This is a database of matching patterns. Most of these were discovered by hand,
+ # looking at OA journal content that failed to crawl/ingest.
+PDF_FULLTEXT_PATTERNS: List[dict] = [
+ {
+ "selector": "head meta[name='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "selector": "head meta[name='bepress_citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "in_doc_url": "journals.lww.com",
+ "selector": "head meta[name='wkhealth_pdf_url']",
+ "attr": "content",
+ "technique": "wkhealth_pdf_url",
+ "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
+ },
+ {
+ "selector": "head meta[propery='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ # eg, researchgate
+ },
+ {
+ "selector": "head meta[name='eprints.document_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url (property)",
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE/UTP show-pdflink",
+ "example_page": "https://journals.sagepub.com/doi/10.1177/2309499019888836",
+ # also http://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a[title='PDF']",
+ "attr": "href",
+ "technique": "title=PDF link",
+ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "selector": "a#pdfDownloadLink",
+ "attr": "href",
+ "technique": "pdfDownloadLink link",
+ "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
+ },
+ {
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE PDF link",
+ "example_page": "http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836",
+ },
+ {
+ "in_doc_url": "://elifesciences.org/articles/",
+ "in_fulltext_url": "/download/",
+ "selector": "a[data-download-type='pdf-article']",
+ "attr": "href",
+ "technique": "eLife PDF link",
+ "example_page": "https://elifesciences.org/articles/59841",
+ },
+ {
+ "in_doc_url": "://www.jcancer.org/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".divboxright a.text-button",
+ "attr": "href",
+ "technique": "jcancer PDF link",
+ "example_page": "https://www.jcancer.org/v10p4038.htm",
+ },
+ {
+ "in_doc_url": "://www.tandfonline.com/doi/full/10.",
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "t+f show-pdf link",
+ "example_page": "https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234",
+ },
+ {
+ "in_doc_url": "article_id=",
+ "in_fulltext_url": "download.php",
+ "selector": "a.file.pdf",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405",
+ },
+ {
+ "in_doc_url": "/content/10.",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[title='Download']",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230",
+ },
+ {
+ "selector": "embed[type='application/pdf']",
+ "attr": "src",
+ "technique": "PDF embed",
+ "example_page": "http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401",
+ },
+ {
+ "in_doc_url": "/html/",
+ "in_fulltext_url": "create_pdf",
+ "selector": ".AbsPdfFigTab img[src='images/pdf-icon.jpg'] + a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.aed.org.cn/nyzyyhjxb/html/2018/4/20180408.htm",
+ },
+ {
+ "in_doc_url": "/archive-detail/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".contact-list a.download-pdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
+ },
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
+ {
+ "in_doc_url": "library.wur.nl/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.wl_full_text_restricted",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+ },
+ {
+ "in_doc_url": "/dlibra/",
+ "in_fulltext_url": "pdf",
+ "selector": "iframe#js-main-frame",
+ "attr": "src",
+ "technique": "PDF iframe (dlibra)",
+ "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+ },
+ {
+ "in_doc_url": "/handle/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.misc table.inner tr.b a",
+ "attr": "href",
+ "technique": "PDF URL link (DSpace, first file)",
+ "example_page": "https://orbi.uliege.be/handle/2268/174200",
+ },
+ {
+ "in_doc_url": "/publications/",
+ "in_fulltext_url": "pdf",
+ "selector": ".publication-sidebar li.open-access a.document-link",
+ "attr": "href",
+ "technique": "PDF URL link (Pure repo, OA link)",
+ "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+ },
+ {
+ "in_doc_url": "//hal",
+ "selector": ".widget-openaccess .widget-content a",
+ "attr": "href",
+ "technique": "Fulltext OA URL (HAL)",
+ "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+ },
+ {
+ "in_doc_url": "/record/",
+ "in_fulltext_url": "pdf",
+ "selector": "#detailedrecordminipanelfile a",
+ "attr": "href",
+ "technique": "PDF URL link (Invenio)",
+ "example_page": "https://bib-pubdb1.desy.de/record/416556",
+ },
+ {
+ "in_doc_url": "/available/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.file-table a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+ },
+ {
+ "in_doc_url": "/islandora/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.islandora-pdf-link",
+ "attr": "href",
+ "technique": "PDF URL link (Islandora)",
+ "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+ },
+ {
+ "in_doc_url": "/receive/",
+ "in_fulltext_url": "pdf",
+ "selector": ".mir-preview noscript a",
+ "attr": "href",
+ "technique": "PDF iframe via noscript (MyCoRe)",
+ "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+ },
+ {
+ "in_doc_url": "/registro.do",
+ "in_fulltext_url": "imagenes",
+ "selector": ".resumen_bib a[data-analytics=media]",
+ "attr": "href",
+ "technique": "Media link (DIGIBIS)",
+ "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+ },
+ {
+ "in_doc_url": "/view",
+ "in_fulltext_url": "/at_download/",
+ "selector": ".documentContent #content a",
+ "attr": "href",
+ "technique": "Media link (Plone)",
+ "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+ },
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
+]
+
# Substrings which disqualify a candidate fulltext URL. Candidates are
# lower-cased before being compared against these.
# NOTE: every entry needs a trailing comma; without one, adjacent string
# literals are silently concatenated into a single (never-matching) pattern.
FULLTEXT_URL_PATTERNS_SKIP = [
    # wiley has a weird almost-blank page we don't want to loop on
    "://onlinelibrary.wiley.com/doi/pdf/",
    "://doi.org/",
    "://dx.doi.org/",
]

# Maps a lower-cased, whitespace-stripped raw release type string (as found in
# HTML metadata) to the normalized release_type value.
RELEASE_TYPE_MAP = {
    "research article": "article-journal",
    "text.serial.journal": "article-journal",
}
+
+
class BiblioMetadata(pydantic.BaseModel):
    """
    Bibliographic metadata extracted from an HTML document.

    Every field is optional. The `None` defaults are spelled out explicitly
    rather than relying on pydantic v1 treating a bare `Optional[...]`
    annotation as "not required"; this keeps the model constructible from a
    partial dict regardless of pydantic major version.
    """

    # descriptive metadata
    title: Optional[str] = None
    subtitle: Optional[str] = None
    contrib_names: Optional[List[str]] = None
    release_date: Optional[datetime.date] = None
    release_year: Optional[int] = None
    release_type: Optional[str] = None
    release_stage: Optional[str] = None
    withdrawn_status: Optional[str] = None
    lang: Optional[str] = None
    country_code: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    number: Optional[str] = None
    pages: Optional[str] = None
    first_page: Optional[str] = None
    last_page: Optional[str] = None
    license: Optional[str] = None
    publisher: Optional[str] = None
    container_name: Optional[str] = None
    container_abbrev: Optional[str] = None
    container_issn: Optional[str] = None
    container_type: Optional[str] = None
    raw_references: Optional[List[str]] = None

    # identifiers
    doi: Optional[str] = None
    pmid: Optional[str] = None
    pmcid: Optional[str] = None
    isbn13: Optional[str] = None
    publisher_ident: Optional[str] = None
    oai_id: Optional[str] = None

    # abstract and links to fulltext / component resources
    abstract: Optional[str] = None
    pdf_fulltext_url: Optional[str] = None
    html_fulltext_url: Optional[str] = None
    xml_fulltext_url: Optional[str] = None
    component_url: Optional[str] = None

    class Config:
        # serialize dates as ISO 8601 strings when encoding to JSON
        json_encoders = {
            datetime.date: lambda dt: dt.isoformat(),
        }
+
+
def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
    """
    Tries to quickly extract fulltext URLs using a set of patterns. This
    function is intended to be generic across various extraction techniques.

    Each pattern is a dict. Keys read here:

    - 'selector': CSS selector for the element holding the URL (required)
    - 'attr': name of the element attribute holding the URL (required)
    - 'in_doc_url': if present, must be a substring of doc_url
    - 'in_fulltext_url': if present, must be a substring of the extracted URL
    - 'technique': label returned alongside the URL ('unknown' if absent)

    Extracted URLs are resolved against doc_url. Any URL containing one of
    FULLTEXT_URL_PATTERNS_SKIP (compared lower-cased) is rejected.

    Returns None or a tuple of (url, technique). A URL which fuzzy-matches
    doc_url itself is only returned if no other pattern produced a result.
    """
    self_doc_url: Optional[Tuple[str, str]] = None
    for pattern in patterns:
        # a pattern without a selector or an attribute can't yield a URL
        if 'selector' not in pattern or 'attr' not in pattern:
            continue
        if 'in_doc_url' in pattern and pattern['in_doc_url'] not in doc_url:
            continue
        elem = doc.css_first(pattern['selector'])
        if not elem:
            continue
        val = elem.attrs.get(pattern['attr'])
        if not val:
            continue
        val = urllib.parse.urljoin(doc_url, val)
        assert val
        if 'in_fulltext_url' in pattern and pattern['in_fulltext_url'] not in val:
            continue
        # Reject URLs matching a skip pattern. This must skip the whole
        # *pattern*; a bare `continue` inside a nested `for` over the skip
        # list would only advance that inner loop and reject nothing.
        val_lower = val.lower()
        if any(skip_pattern in val_lower for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP):
            continue
        technique = pattern.get('technique', 'unknown')
        if url_fuzzy_equal(doc_url, val):
            # don't link to self, unless no other options
            self_doc_url = (val, technique)
            continue
        return (val, technique)
    if self_doc_url:
        print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
        return self_doc_url
    return None
+
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
    """
    Extracts bibliographic metadata from the <head> of an HTML document.

    Steps, in order:

    1. single-valued fields from HEAD_META_PATTERNS (first selector with a
       non-empty 'content' attribute wins)
    2. list-valued fields from HEAD_META_LIST_PATTERNS (all non-empty
       'content' values of the first selector that matches any element)
    3. fulltext/component URLs via html_extract_fulltext_url()
    4. cleanup of 'doi', 'raw_identifiers', 'raw_date' and 'raw_release_type'

    Returns None if the document has no <head> element, otherwise a
    BiblioMetadata built from the collected values.
    """
    head = doc.css_first("head")
    if not head:
        return None

    meta: Any = {}

    for field, selectors in HEAD_META_PATTERNS.items():
        for selector in selectors:
            node = head.css_first(selector)
            if node and 'content' in node.attrs and node.attrs['content']:
                meta[field] = node.attrs['content']
                break

    for field, selectors in HEAD_META_LIST_PATTERNS.items():
        for selector in selectors:
            nodes = head.css(selector)
            if not nodes:
                continue
            for node in nodes:
                if 'content' in node.attrs and node.attrs['content']:
                    meta.setdefault(field, []).append(node.attrs['content'])
            # only the first selector matching any element is considered
            break

    # (some) fulltext extractions; these override any value set above
    url_extractions = (
        ('pdf_fulltext_url', PDF_FULLTEXT_PATTERNS),
        ('xml_fulltext_url', XML_FULLTEXT_PATTERNS),
        ('html_fulltext_url', HTML_FULLTEXT_PATTERNS),
        ('component_url', COMPONENT_FULLTEXT_PATTERNS),
    )
    for key, url_patterns in url_extractions:
        extracted = html_extract_fulltext_url(doc_url, doc, url_patterns)
        if extracted:
            meta[key] = extracted[0]

    # TODO: replace with clean_doi() et al
    doi = meta.get('doi')
    if doi and doi.startswith('doi:'):
        meta['doi'] = meta['doi'][4:]

    # identifiers only fill in fields which are not already set
    for ident in meta.pop('raw_identifiers', []):
        if ident.startswith('doi:10.'):
            if 'doi' not in meta:
                meta['doi'] = ident.replace('doi:', '')
        elif ident.startswith('10.') and '/' in ident:
            if 'doi' not in meta:
                meta['doi'] = ident
        elif ident.startswith('isbn:'):
            # NOTE(review): BiblioMetadata declares 'isbn13', not 'isbn' --
            # confirm whether this value is meant to reach the model
            if 'isbn' not in meta:
                meta['isbn'] = ident.replace('isbn:', '')

    raw_date = meta.pop('raw_date', None)
    if raw_date:
        parsed = dateparser.parse(raw_date)
        if parsed:
            meta['release_date'] = parsed.date()

    raw_release_type = meta.pop('raw_release_type', None)
    if raw_release_type:
        release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
        if release_type:
            meta['release_type'] = release_type

    return BiblioMetadata(**meta)
+
def load_adblock_rules() -> braveblock.Adblocker:
    """
    Builds the adblocker used to filter page sub-resources: EasyList and
    EasyPrivacy plus a set of extra rules for badges, share buttons, fonts,
    captchas and similar assets.

    TODO: consider blocking very generic assets:
    - ://fonts.googleapis.com/css*
    - ://journals.plos.org/plosone/resource/img/icon.*
    """
    return braveblock.Adblocker(
        include_easylist=True,
        include_easyprivacy=True,
        rules=[
            "/favicon.ico^",
            "||fonts.googleapis.com^",
            "||widgets.figshare.com^",
            "||crossmark-cdn.crossref.org^",
            "||crossmark.crossref.org^",
            "||platform.twitter.com^",
            "||verify.nature.com^",
            "||s7.addthis.com^",
            "||www.mendeley.com^",
            "||pbs.twimg.com^",
            "||badge.dimensions.ai^",
            "||recaptcha.net^",

            # not sure about these CC badges (usually via a redirect)
            #"||licensebuttons.net^",
            #"||i.creativecommons.org^",

            # Should we skip jquery, or other generic javascript CDNs?
            #"||code.jquery.com^",
            #"||ajax.googleapis.com^",
            #"||cdnjs.cloudflare.com^",

            # badges, "share" buttons, tracking, etc
            "apis.google.com/js/plusone",
            "www.google.com/recaptcha/",
            # the trailing comma matters: without it this rule and the PLOS
            # rule below are concatenated into one meaningless rule
            "js/_getUACode.js",

            # PLOS images
            "/resource/img/icon.*.16.png^",
        ],
    )
+
+
+def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+ resources = []
+
+ for node in doc.css(selector):
+ for attr in attrs:
+ if not attr in node.attrs:
+ continue
+ url = node.attrs.get(attr)
+ # special-case a couple meta URI prefixes which don't match with adblock rules
+ skip = False
+ for prefix in ['about:', 'data:', 'magnet:', 'urn:', 'mailto:']:
+ if url and url.startswith(prefix):
+ skip = True
+ break
+ if skip:
+ continue
+ if url:
+ #print(url, file=sys.stderr)
+ resources.append(dict(url=url.strip(), type=type_name))
+
+ return resources
+
+
def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list:
    """
    This function tries to find all the important resources in a page. The
    presumption is that the HTML document is article fulltext, and we want the
    list of all resources (by URL) necessary to replay the page.

    The returned resource URLs each have a type (script, img, css, etc), and
    should be fully-qualified URLs (not relative).

    Adblock filtering is run to remove unwanted resources.

    Returns a list of dicts with keys 'url' and 'type', without duplicates,
    in order of first appearance (so the output is stable between runs).
    """
    # (selector, attributes, resource type), in extraction order
    # TODO: srcset and parse
    # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w
    targets = [
        ("script", ["src"], "script"),
        ("link[rel='stylesheet']", ["href"], "stylesheet"),
        ("img", ["src"], "image"),
        ("audio", ["src"], "audio"),
        ("video", ["src"], "media"),
        ("source", ["src"], "media"),
        ("track", ["src"], "media"),
        ("iframe", ["src"], "subdocument"),
        ("embed", ["src"], "media"),
    ]

    resources = []
    seen = set()
    for selector, attrs, type_name in targets:
        for r in _extract_generic(doc, selector, attrs, type_name):
            # ensure URLs are absolute
            r['url'] = urllib.parse.urljoin(doc_url, r['url'])

            # remove duplicates, keeping the first occurrence; doing this
            # before filtering also avoids repeated adblock checks
            key = (r['url'], r['type'])
            if key in seen:
                continue
            seen.add(key)

            # filter using adblocker: keep only resources which are not blocked
            if not adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']):
                resources.append(r)

    return resources
+
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
new file mode 100644
index 0000000..c586972
--- /dev/null
+++ b/python/sandcrawler/ia.py
@@ -0,0 +1,1138 @@
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import time
+import gzip
+import json
+import requests
+import datetime
+import urllib.parse
+import urllib3.exceptions
+from typing import Tuple
+from collections import namedtuple
+
+import http.client
+
+# not sure this will really work. Should go before wayback imports.
+http.client._MAXHEADERS = 1000 # type: ignore
+
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory3
+
+from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
+
class SandcrawlerBackoffError(Exception):
    """
    Signals backpressure, and is meant to propagate up through multiple
    abstraction layers. For example, SPNv2 back-pressure sometimes needs to
    pass through any timeout/retry code and turn into an actual long pause or
    crash.
    """
+
# Outcome of looking up (and possibly fetching) a resource starting at a URL.
ResourceResult = namedtuple(
    "ResourceResult",
    "start_url hit status terminal_url terminal_dt terminal_status_code body cdx revisit_cdx",
)

# A single record loaded from a WARC file.
WarcResource = namedtuple(
    "WarcResource",
    "status_code location body revisit_cdx",
)

# Fields shared by full and partial CDX rows.
_CDX_COMMON_FIELDS = (
    'surt',
    'datetime',
    'url',
    'mimetype',
    'status_code',
    'sha1b32',
    'sha1hex',
)

# Full CDX row, including the location of the record inside a WARC file.
CdxRow = namedtuple(
    'CdxRow',
    _CDX_COMMON_FIELDS + ('warc_csize', 'warc_offset', 'warc_path'),
)

# CDX row without the WARC location fields.
CdxPartial = namedtuple('CdxPartial', _CDX_COMMON_FIELDS)
+
def cdx_partial_from_row(full):
    """
    Builds a CdxPartial from an object carrying the CdxPartial attributes
    (eg, a CdxRow), dropping everything else.
    """
    return CdxPartial(
        full.surt,
        full.datetime,
        full.url,
        full.mimetype,
        full.status_code,
        full.sha1b32,
        full.sha1hex,
    )
+
def cdx_to_dict(cdx):
    """
    Converts a CDX row (CdxRow or CdxPartial) into a plain dict.

    The WARC location keys are only included for a CdxRow whose warc_path
    contains a '/'.
    """
    common_keys = (
        "surt",
        "datetime",
        "url",
        "mimetype",
        "status_code",
        "sha1b32",
        "sha1hex",
    )
    d = {key: getattr(cdx, key) for key in common_keys}
    if type(cdx) == CdxRow and '/' in cdx.warc_path:
        d.update(
            warc_csize=cdx.warc_csize,
            warc_offset=cdx.warc_offset,
            warc_path=cdx.warc_path,
        )
    return d
+
def fuzzy_match_url(left, right):
    """
    Compares two URLs while ignoring the scheme (http/https/...) and a single
    trailing slash. Other normalizations may be added in the future.

    Returns True if the URLs are considered the same, False otherwise.
    """
    if left == right:
        return True
    scheme_sep = '://'
    if scheme_sep in left and scheme_sep in right:
        # drop everything up to and including the first '://'
        left = left.split(scheme_sep, 1)[1]
        right = right.split(scheme_sep, 1)[1]
    return left == right or left == right + "/" or right == left + "/"
+
def test_fuzzy_match_url():
    """Table-driven checks for fuzzy_match_url()."""
    cases = [
        ("http://thing.com", "http://thing.com", True),
        ("http://thing.com", "https://thing.com", True),
        ("http://thing.com", "ftp://thing.com", True),
        ("http://thing.com", "http://thing.com/", True),
        ("https://thing.com", "http://thing.com/", True),
        ("https://thing.com/", "http://thing.com", True),
        ("http://thing.com", "http://thing.com/blue", False),

        # should probably handle these?
        ("http://thing.com", "http://www.thing.com", False),
        ("http://www.thing.com", "http://www2.thing.com", False),
        ("http://www.thing.com", "https://www2.thing.com", False),
    ]
    for left, right, expected in cases:
        assert fuzzy_match_url(left, right) == expected
+
class CdxApiError(Exception):
    """Raised by CdxApiClient on a non-200 or malformed CDX API response."""
+
class CdxApiClient:
    """
    Client for the wayback CDX API.

    Requires an auth token, passed as the `cdx_auth_token` keyword argument
    or via the CDX_AUTH_TOKEN environment variable.
    """

    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
        self.host_url = host_url
        self.http_session = requests_retry_session(retries=3, backoff_factor=3)
        cdx_auth_token = kwargs.get('cdx_auth_token',
            os.environ.get('CDX_AUTH_TOKEN'))
        if not cdx_auth_token:
            raise Exception("CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)")
        self.http_session.headers.update({
            'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
            'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
        })

    def _query_api(self, params):
        """
        Hits CDX API with a query, parses result into a list of CdxRow

        Returns None if the response body is empty or has no rows beyond the
        first (header) row. Rows without WARC location fields are dropped.

        Raises CdxApiError on a non-200 response or a row with an unexpected
        number of fields.
        """
        resp = self.http_session.get(self.host_url, params=params)
        if resp.status_code != 200:
            raise CdxApiError(resp.text)
        #print(resp.url, file=sys.stderr)
        if not resp.text:
            return None
        rj = resp.json()
        if len(rj) <= 1:
            return None
        rows = []
        for raw in rj[1:]:
            # check number of CDX fields; there is a bug with some rows having
            # spaces in WARC filename resulting in extra bogus fields
            if len(raw) != 11:
                raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")

            # transform "-" ftp status code to a 226; any other "-" status
            # is left as None
            status_code = None
            if raw[4] == "-":
                if raw[3] != "warc/revisit" and raw[2].startswith("ftp://"):
                    status_code = 226
            else:
                status_code = int(raw[4])

            # CDX rows with no WARC records?
            if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
                continue

            row = CdxRow(
                surt=raw[0],
                datetime=raw[1],
                url=raw[2],
                mimetype=raw[3],
                status_code=status_code,
                sha1b32=raw[5],
                sha1hex=b32_hex(raw[5]),
                warc_csize=int(raw[8]),
                warc_offset=int(raw[9]),
                warc_path=raw[10],
            )
            assert (row.mimetype == "-") or ("-" not in row)
            rows.append(row)
        return rows

    def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None):
        """
        Fetches a single CDX row by url/datetime. Raises a KeyError if not
        found, because we expect to be looking up a specific full record.

        `datetime` must be a full 14 digit timestamp string (ValueError
        otherwise). If `retry_sleep` is a positive number of seconds, a failed
        lookup is retried after sleeping; an empty result sleeps in steps of
        at most 3 seconds until the budget is used up, a mismatched result is
        retried once.
        """
        if len(datetime) != 14:
            raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
        params = {
            'url': url,
            'from': datetime,
            'to': datetime,
            'matchType': 'exact',
            'limit': 1,
            'output': 'json',
        }
        if filter_status_code:
            params['filter'] = "statuscode:{}".format(filter_status_code)
        resp = self._query_api(params)
        if not resp:
            if retry_sleep and retry_sleep > 0:
                # sleep at most 3 seconds now; pass any remainder on to the
                # recursive retry
                next_sleep = None
                if retry_sleep > 3:
                    next_sleep = retry_sleep - 3
                    retry_sleep = 3
                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                time.sleep(retry_sleep)
                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep)
            raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
        row = resp[0]
        # allow fuzzy http/https match
        if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
            if retry_sleep and retry_sleep > 0:
                print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                time.sleep(retry_sleep)
                return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
            raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
        if filter_status_code:
            assert row.status_code == filter_status_code
        return row

    def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
        """
        Fetches multiple CDX rows for the given URL, tries to find the most recent.

        If no matching row is found, return None. Note this is different from fetch.

        Preference order by status code looks like:

            200 or 226
                mimetype match
                    not-liveweb
                        most-recent
                no match
                    not-liveweb
                        most-recent
            3xx
                most-recent
            4xx
                most-recent
            5xx
                most-recent

        """
        params = {
            'url': url,
            'matchType': 'exact',
            'limit': -25,
            'output': 'json',
            # Collapsing seems efficient, but is complex; would need to include
            # other filters and status code in filter
            #'collapse': 'timestamp:6',

            # Revisits now allowed and resolved!
            #'filter': '!mimetype:warc/revisit',
        }
        if max_age_days:
            since = datetime.date.today() - datetime.timedelta(days=max_age_days)
            # plain YYYYMMDD string (a trailing comma here would make this a
            # 1-tuple instead of a string)
            params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day)
        if closest:
            params['closest'] = closest
            params['sort'] = "closest"
        #print(params, file=sys.stderr)
        rows = self._query_api(params)
        if not rows:
            return None

        def _cdx_sort_key(r):
            """
            This is a function, not a lambda, because it captures
            best_mimetype. Will create a tuple that can be used to sort in
            *reverse* order.
            """
            return (
                int(r.status_code in (200, 226)),
                # rows with no status code sort below every real status
                int(0 - (r.status_code or 999)),
                int(r.mimetype == best_mimetype),
                int(r.mimetype != "warc/revisit"),
                int(r.datetime[:6]),
                int('/' in r.warc_path),
                int(r.datetime),
            )

        # ascending sort, so the most preferred row ends up last
        rows = sorted(rows, key=_cdx_sort_key)
        return rows[-1]
+
+
class WaybackError(Exception):
    """Failure while talking to or reading from wayback."""


class WaybackContentError(Exception):
    """Problem with the archived content itself."""


class PetaboxError(Exception):
    """Failure while loading WARC contents from petabox."""


class NoCaptureError(Exception):
    """No capture exists for the requested resource."""
+
+class WaybackClient:
+
+ def __init__(self, cdx_client=None, **kwargs):
+ if cdx_client:
+ self.cdx_client = cdx_client
+ else:
+ self.cdx_client = CdxApiClient()
+ # /serve/ instead of /download/ doesn't record view count
+ # this *does* want to be http://, not https://
+ self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+ self.petabox_webdata_secret = kwargs.get(
+ 'petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'),
+ )
+ self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.rstore = None
+ self.max_redirects = 25
+ self.wayback_endpoint = "https://web.archive.org/web/"
+ self.replay_headers = {
+ 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ }
+
+ def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
+ """
+ Fetches wayback resource directly from petabox using WARC path/offset/csize.
+
+ If there is a problem with petabox, raises a PetaboxError.
+ If resource doesn't exist, would raise a KeyError (TODO).
+
+ The body is only returned if the record is success (HTTP 200 or
+ equivalent). Otherwise only the status and header info is returned.
+
+ WarcResource object (namedtuple) contains fields:
+ - status_code: int
+ - location: eg, for redirects
+ - body: raw bytes
+
+ resolve_revist does what it sounds like: tries following a revisit
+ record by looking up CDX API and then another fetch. Refuses to recurse
+ more than one hop (eg, won't follow a chain of revisits).
+
+ Requires (and uses) a secret token.
+ """
+ if not self.petabox_webdata_secret:
+ raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
+ if not "/" in warc_path:
+ raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+ warc_uri = self.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ ))
+ try:
+ #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
+ except wayback.exception.ResourceUnavailable:
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except wayback.exception.InvalidResource:
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise WaybackContentError("failed to load file contents from wayback/petabox (InvalidResource)")
+ except urllib3.exceptions.ReadTimeoutError as rte:
+ raise PetaboxError("failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(rte))
+ except ValueError as ve:
+ raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ except Exception as e:
+ if "while decompressing data: invalid block type" in str(e):
+ raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ else:
+ raise e
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ try:
+ status_code = gwb_record.get_status()[0]
+ except http.client.HTTPException:
+ raise WaybackContentError("too many HTTP headers (in wayback fetch)")
+ location = gwb_record.get_location() or None
+
+ if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
+ # TODO: some additional verification here?
+ status_code = 226
+
+ body = None
+ revisit_cdx = None
+ if gwb_record.is_revisit():
+ if not resolve_revisit:
+ raise WaybackContentError("found revisit record, but won't resolve (loop?)")
+ revisit_uri, revisit_dt = gwb_record.refers_to
+ if not (revisit_uri and revisit_dt):
+ raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, offset))
+ # convert revisit_dt
+ # len("2018-07-24T11:56:49"), or with "Z"
+ assert len(revisit_dt) in (19, 20)
+ if type(revisit_uri) is bytes:
+ revisit_uri = revisit_uri.decode('utf-8')
+ if type(revisit_dt) is bytes:
+ revisit_dt = revisit_dt.decode('utf-8')
+ revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ assert len(revisit_dt) == 14
+ try:
+ revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+ body = self.fetch_petabox_body(
+ csize=revisit_cdx.warc_csize,
+ offset=revisit_cdx.warc_offset,
+ warc_path=revisit_cdx.warc_path,
+ resolve_revisit=False,
+ expected_status_code=revisit_cdx.status_code,
+ )
+ except KeyError as ke:
+ raise WaybackError("Revist resolution failed: {}".format(ke))
+ elif status_code in (200, 226):
+ try:
+ body = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ raise WaybackError(
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ elif status_code is None:
+ raise WaybackContentError(
+ "got a None status_code in (W)ARC record")
+ return WarcResource(
+ status_code=status_code,
+ location=location,
+ body=body,
+ revisit_cdx=revisit_cdx,
+ )
+
+ def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None):
+ """
+ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
+
+ Returns bytes. Raises KeyError if resource wasn't an HTTP 200.
+
+ Thin helper around fetch_petabox()
+ """
+ resource = self.fetch_petabox(
+ csize=csize,
+ offset=offset,
+ warc_path=warc_path,
+ resolve_revisit=resolve_revisit,
+ )
+
+ if expected_status_code:
+ if expected_status_code != resource.status_code:
+ raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
+ )
+ )
+ elif resource.status_code not in (200, 226):
+ raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
+ resource.status_code)
+ )
+
+ return resource.body
+
+ def fetch_replay_body(self, url, datetime, cdx_sha1hex=None):
+ """
+ Fetches an HTTP 200 record from wayback via the replay interface
+ (web.archive.org) instead of petabox.
+
+ Intended for use with SPN2 requests, where request body has not ended
+ up in petabox yet.
+
+ If cdx_sha1hex is passed, will try to verify fetched body. Note that
+ this check *won't work* in many cases, due to CDX hash being of
+ compressed transfer data, not the uncompressed final content bytes.
+
+ TODO: could instead try to verify that we got the expected replay body
+ using... new X-Archive headers?
+ """
+
+ # defensively check datetime format
+ assert len(datetime) == 14
+ assert datetime.isdigit()
+
+ try:
+ resp = requests.get(
+ self.wayback_endpoint + datetime + "id_/" + url,
+ allow_redirects=False,
+ headers=self.replay_headers,
+ )
+ except requests.exceptions.TooManyRedirects:
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ChunkedEncodingError:
+ raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
+ except UnicodeDecodeError:
+ raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
+
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ #print(resp.url, file=sys.stderr)
+
+ # defensively check that this is actually correct replay based on headers
+ if not "X-Archive-Src" in resp.headers:
+ raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
+ if not datetime in resp.url:
+ raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+
+ if cdx_sha1hex:
+ # verify that body matches CDX hash
+ # TODO: don't need *all* these hashes, just sha1
+ file_meta = gen_file_metadata(resp.content)
+ if cdx_sha1hex != file_meta['sha1hex']:
+ print(" REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex,
+ file_meta['sha1hex']),
+ file=sys.stderr)
+ raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ cdx_sha1hex,
+ file_meta['sha1hex']),
+ )
+ return resp.content
+
def fetch_replay_redirect(self, url, datetime):
    """
    Fetches an HTTP 3xx redirect Location from wayback via the replay interface
    (web.archive.org) instead of petabox.

    Intended for use with SPN2 requests, where request body has not ended
    up in petabox yet. For example, re-ingesting a base_url which was
    recently crawled by SPNv2, where we are doing ingest via wayback path.

    Args:
        url: captured URL to request through the replay endpoint
        datetime: capture timestamp; must be a string of exactly 14 digits

    Returns the redirect target (passed through clean_url()), or None if a
    response is found, but no usable redirect could be extracted from it.

    Raises:
        AssertionError: datetime is not a 14-digit string
        WaybackContentError: redirect loop, or UnicodeDecodeError while
            issuing the request
        WaybackError: HTTP error status, missing X-Archive-Src header, or
            the response URL does not contain the requested datetime
    """

    # defensively check datetime format
    assert len(datetime) == 14
    assert datetime.isdigit()

    try:
        resp = requests.get(
            self.wayback_endpoint + datetime + "id_/" + url,
            allow_redirects=False,
            headers=self.replay_headers,
        )
    except requests.exceptions.TooManyRedirects as e:
        # chain the original exception so the root cause stays visible
        raise WaybackContentError("redirect loop (wayback replay fetch)") from e
    except UnicodeDecodeError as e:
        raise WaybackContentError(
            "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url)
        ) from e
    try:
        resp.raise_for_status()
    except Exception as e:
        raise WaybackError(str(e)) from e

    # defensively check that this is actually correct replay based on headers
    # previously check for "X-Archive-Redirect-Reason" here
    if "X-Archive-Src" not in resp.headers:
        raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
    if datetime not in resp.url:
        raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))

    redirect_url = resp.headers.get("Location")
    # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
    if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
        # drop the "https://web.archive.org/web/<timestamp>id_/" prefix, which
        # is the first five "/"-separated components
        redirect_url = "/".join(redirect_url.split("/")[5:])
    if redirect_url and redirect_url.startswith("http"):
        return clean_url(redirect_url)
    return None
+
def lookup_resource(self, start_url, best_mimetype=None, closest=None):
    """
    Looks in wayback for a resource starting at the URL, following any
    redirects. Returns a ResourceResult object, which may indicate a
    failure to fetch the resource.

    Only raises exceptions on remote service failure or unexpected
    problems.

    In a for loop:

        lookup "best" CDX
        redirect status code?
            fetch wayback
            continue
        success (200)?
            fetch wayback
            return success
        bad (other status)?
            return failure

    got to end?
        return failure; too many redirects

    Args:
        start_url: URL to begin the lookup from
        best_mimetype: passed through to cdx_client.lookup_best()
        closest: passed through to cdx_client.lookup_best()

    Non-hit statuses produced here: "no-capture", "bad-redirect",
    "redirect-loop", "terminal-bad-status", "redirects-exceeded".
    """

    def miss(status, row, terminal_url=None):
        # Non-hit result. With a CDX row the terminal fields come from that
        # row; without one, only the terminal URL is known.
        if row is None:
            return ResourceResult(
                start_url=start_url,
                hit=False,
                status=status,
                terminal_url=terminal_url,
                terminal_dt=None,
                terminal_status_code=None,
                body=None,
                cdx=None,
                revisit_cdx=None,
            )
        return ResourceResult(
            start_url=start_url,
            hit=False,
            status=status,
            terminal_url=row.url,
            terminal_dt=row.datetime,
            terminal_status_code=row.status_code,
            body=None,
            cdx=row,
            revisit_cdx=None,
        )

    def found(row, status_code, body, revisit_cdx):
        # Successful (hit) result for the given CDX row.
        return ResourceResult(
            start_url=start_url,
            hit=True,
            status="success",
            terminal_url=row.url,
            terminal_dt=row.datetime,
            terminal_status_code=status_code,
            body=body,
            cdx=row,
            revisit_cdx=revisit_cdx,
        )

    next_url = start_url
    urls_seen = [start_url]
    # stays None when the loop body never runs (max_redirects <= 0), so the
    # final "redirects-exceeded" result can still be built
    cdx_row = None
    for _ in range(self.max_redirects):
        print(" URL: {}".format(next_url), file=sys.stderr)
        cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
        if not cdx_row:
            return miss("no-capture", None, terminal_url=next_url)

        # first try straight-forward redirect situation
        if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
            resource = self.fetch_petabox(
                csize=cdx_row.warc_csize,
                offset=cdx_row.warc_offset,
                warc_path=cdx_row.warc_path,
            )
            if resource.revisit_cdx and resource.revisit_cdx.status_code in (200, 226):
                return found(
                    cdx_row,
                    resource.revisit_cdx.status_code,
                    resource.body,
                    resource.revisit_cdx,
                )
            # else, continue processing with revisit record

        if cdx_row.status_code in (200, 226):
            revisit_cdx = None
            if '/' in cdx_row.warc_path:
                resource = self.fetch_petabox(
                    csize=cdx_row.warc_csize,
                    offset=cdx_row.warc_offset,
                    warc_path=cdx_row.warc_path,
                )
                body = resource.body
                revisit_cdx = resource.revisit_cdx
            else:
                # no petabox-style path in the CDX row: fetch via replay
                body = self.fetch_replay_body(
                    url=cdx_row.url,
                    datetime=cdx_row.datetime,
                )
                cdx_row = cdx_partial_from_row(cdx_row)
            return found(cdx_row, cdx_row.status_code, body, revisit_cdx)

        if 300 <= (cdx_row.status_code or 0) < 400:
            if '/' in cdx_row.warc_path:
                resource = self.fetch_petabox(
                    csize=cdx_row.warc_csize,
                    offset=cdx_row.warc_offset,
                    warc_path=cdx_row.warc_path,
                    resolve_revisit=False,
                )
                assert 300 <= resource.status_code < 400
                if not resource.location:
                    print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
                    return miss("bad-redirect", cdx_row)
                if "://" not in resource.location:
                    # relative Location: resolve against the URL just looked up
                    next_url = urllib.parse.urljoin(next_url, resource.location)
                else:
                    next_url = resource.location
                if next_url:
                    next_url = clean_url(next_url)
            else:
                next_url = self.fetch_replay_redirect(
                    url=cdx_row.url,
                    datetime=cdx_row.datetime,
                )
                if next_url:
                    next_url = clean_url(next_url)
                cdx_row = cdx_partial_from_row(cdx_row)
                if not next_url:
                    print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
                    return miss("bad-redirect", cdx_row)
            if next_url in urls_seen:
                return miss("redirect-loop", cdx_row)
            urls_seen.append(next_url)
            continue

        # neither success nor redirect
        return miss("terminal-bad-status", cdx_row)

    return miss("redirects-exceeded", cdx_row, terminal_url=next_url)
+
+
class SavePageNowError(Exception):
    """Raised for failures of, or reported by, the Save Page Now (SPN2) API."""
+
class SavePageNowBackoffError(SandcrawlerBackoffError):
    """Raised when SPN2 signals that the client should back off (HTTP 429 or a session limit)."""
+
# Outcome of one SPN2 capture request. Field order matters: instances are
# built positionally.
SavePageNowResult = namedtuple(
    'SavePageNowResult',
    'success status job_id request_url terminal_url terminal_dt resources',
)
+
class SavePageNowClient:
    """
    Client for the Save Page Now v2 (SPN2) HTTP API.

    Submits capture requests, polls the resulting job until it finishes, and
    (in crawl_resource) retrieves the captured body through a wayback client.
    """

    def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
        """
        Args:
            v2endpoint: base URL of the SPN2 API

        Keyword args:
            ia_access_key / ia_secret_key: credentials; default to the
                IA_ACCESS_KEY / IA_SECRET_KEY environment variables
            spn_cdx_retry_sec: retry sleep passed to CDX fetches after a
                capture (default 9.0)
        """
        self.ia_access_key = kwargs.get('ia_access_key',
            os.environ.get('IA_ACCESS_KEY'))
        self.ia_secret_key = kwargs.get('ia_secret_key',
            os.environ.get('IA_SECRET_KEY'))
        self.v2endpoint = v2endpoint
        self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
        self.v2_session.headers.update({
            'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
            'Accept': 'application/json',
            'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
        })

        # 3 minutes total
        self.poll_count = 60
        self.poll_seconds = 3.0

        self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0)

    @staticmethod
    def _is_cookie_block_url(url):
        # True for the common "cookie wall" terminal URLs; tolerates None/empty
        return bool(url) and (url.endswith('/cookieAbsent') or url.endswith("cookieSet=1"))

    def _fetch_job_status(self, job_id):
        # GET the status document of one SPN2 job; HTTP errors become SavePageNowError
        resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
        try:
            resp.raise_for_status()
        except Exception as e:
            # not a bare except: KeyboardInterrupt/SystemExit must propagate
            raise SavePageNowError(resp.content) from e
        return resp.json()

    def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
        """
        Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
        at all, or raises an exception if there was an error with SPN itself.

        If SPN2 was unable to fetch the remote content, `success` will be
        false and status will be indicated.

        SavePageNowResult fields:
        - success: boolean if SPN
        - status: "success" or an error message/type
        - job_id: returned by API
        - request_url: url we asked to fetch
        - terminal_url: final primary resource (after any redirects)
        - terminal_dt: wayback timestamp of final capture
        - resources: list of all URLs captured

        Raises:
            Exception: credentials are not configured
            SavePageNowBackoffError: HTTP 429 or active-session limit reached
            SavePageNowError: any other SPN2 failure, including a polling
                timeout and a "success" status document that lacks
                'original_url'

        TODO: parse SPN error codes (status string) and handle better. Eg,
        non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc.
        """
        if capture_outlinks:
            print(" capturing outlinks!", file=sys.stderr)
        if not (self.ia_access_key and self.ia_secret_key):
            raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
        if request_url.startswith("ftp://"):
            return SavePageNowResult(
                False,
                "spn2-no-ftp",
                None,
                request_url,
                None,
                None,
                None,
            )
        resp = self.v2_session.post(
            self.v2endpoint,
            data={
                'url': request_url,
                'capture_all': 1,
                'capture_outlinks': capture_outlinks,
                'capture_screenshot': 0,
                'if_not_archived_within': '1d',
                'force_get': force_simple_get,
                'skip_first_archive': 1,
                'outlinks_availability': 0,
                'js_behavior_timeout': 0,
            },
        )
        if resp.status_code == 429:
            raise SavePageNowBackoffError("status_code: {}, url: {}".format(resp.status_code, request_url))
        elif resp.status_code != 200:
            raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url))
        resp_json = resp.json()

        if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json['message']:
            raise SavePageNowBackoffError(resp_json['message'])
        elif not resp_json or 'job_id' not in resp_json:
            raise SavePageNowError(
                "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))

        job_id = resp_json['job_id']
        print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)

        # poll until complete
        final_json = None
        for _ in range(self.poll_count):
            # decode the status document once per poll
            poll_json = self._fetch_job_status(job_id)
            status = poll_json['status']
            if status == 'pending':
                time.sleep(self.poll_seconds)
            elif status in ('success', 'error'):
                final_json = poll_json
                break
            else:
                raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url))

        if not final_json:
            raise SavePageNowError("SPN2 timed out (polling count exceeded)")

        # if there was a recent crawl of same URL, fetch the status of that
        # crawl to get correct datetime
        if final_json.get('original_job_id'):
            print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", file=sys.stderr)
            final_json = self._fetch_job_status(final_json['original_job_id'])

        if final_json['status'] == "success":
            original_url = final_json.get('original_url')
            if not original_url:
                # previously crashed with AttributeError on None.startswith()
                raise SavePageNowError(
                    "SPN2 success response missing 'original_url': job_id={} url={}".format(
                        job_id, request_url))
            if original_url.startswith('/'):
                print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", file=sys.stderr)
            return SavePageNowResult(
                True,
                "success",
                job_id,
                request_url,
                original_url,
                final_json['timestamp'],
                final_json['resources'],
            )
        else:
            # reachable via the original_job_id lookup above, which is not polled
            if final_json['status'] == 'pending':
                final_json['status'] = 'error:pending'
            return SavePageNowResult(
                False,
                final_json.get('status_ext') or final_json['status'],
                job_id,
                request_url,
                None,
                None,
                None,
            )

    def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
        """
        Runs a SPN2 crawl, then fetches body.

        There is a delay between SPN2 crawls and WARC upload to petabox, so we
        need to fetch the body via wayback replay instead of petabox
        range-request.

        Args:
            start_url: URL to capture
            wayback_client: object providing cdx_client, fetch_petabox() and
                fetch_replay_body()
            force_simple_get: forwarded to save_url_now_v2()

        Returns a ResourceResult; `hit` is True only when the terminal CDX
        row has status code 200 or 226.

        Raises:
            SavePageNowBackoffError: SPN2 reported "error:user-session-limit"
            plus anything raised by save_url_now_v2()
        """

        def miss(status, terminal_url, terminal_dt):
            # Non-hit result carrying no body and no CDX row.
            return ResourceResult(
                start_url=start_url,
                hit=False,
                status=status,
                terminal_url=terminal_url,
                terminal_dt=terminal_dt,
                terminal_status_code=None,
                body=None,
                cdx=None,
                revisit_cdx=None,
            )

        # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
        if 'gzbd.cnki.net/' in start_url:
            spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1)
        else:
            spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)

        if not spn_result.success:
            status = spn_result.status
            if status in ("error:invalid-url", "error:not-found",
                    "error:invalid-host-resolution", "error:gateway-timeout",
                    "error:too-many-redirects", "error:read-timeout"):
                status = status.replace("error:", "")
            elif status in ("error:no-access", "error:forbidden"):
                status = "forbidden"
            elif status == "error:user-session-limit":
                raise SavePageNowBackoffError("SPNv2 user-session-limit")
            elif status == "error:internal-server-error":
                status = "remote-server-error"
            elif status.startswith("error:"):
                status = "spn2-" + status
            # despite other errors, call these a failure (so we don't retry)
            if self._is_cookie_block_url(spn_result.terminal_url):
                status = "blocked-cookie"
            return miss(status, spn_result.terminal_url, spn_result.terminal_dt)

        # detect partial URL response (aka, success, but missing full URL)
        if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
            return miss("spn2-success-partial-url", spn_result.terminal_url, spn_result.terminal_dt)

        # don't try to CDX fetch for this common cookie block terminal
        if self._is_cookie_block_url(spn_result.terminal_url):
            return miss("blocked-cookie", spn_result.terminal_url, spn_result.terminal_dt)

        cdx_row = None
        # hack to work around elsevier weirdness
        if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
            elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
                spn_result.request_url,
                best_mimetype="application/pdf",
            )
            if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
                print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
                cdx_row = elsevier_pdf_cdx
            else:
                print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)

        if not cdx_row:
            # lookup exact
            try:
                filter_status_code = None
                if spn_result.terminal_url.startswith("ftp://"):
                    filter_status_code = 226
                cdx_row = wayback_client.cdx_client.fetch(
                    url=spn_result.terminal_url,
                    datetime=spn_result.terminal_dt,
                    filter_status_code=filter_status_code,
                    retry_sleep=self.spn_cdx_retry_sec,
                )
                # sometimes there are fuzzy http/https self-redirects with the
                # same SURT; try to work around that
                # ("or 0" avoids a TypeError when the row has no status code)
                if 300 <= (cdx_row.status_code or 0) < 400:
                    cdx_row = wayback_client.cdx_client.fetch(
                        url=spn_result.terminal_url,
                        datetime=spn_result.terminal_dt,
                        filter_status_code=200,
                        retry_sleep=self.spn_cdx_retry_sec,
                    )
            except KeyError as ke:
                print(" CDX KeyError: {}".format(ke), file=sys.stderr)
                return miss("spn2-cdx-lookup-failure", spn_result.terminal_url, spn_result.terminal_dt)

        revisit_cdx = None
        if '/' in cdx_row.warc_path:
            # Usually can't do this kind of direct fetch because CDX result is recent/live
            resource = wayback_client.fetch_petabox(
                csize=cdx_row.warc_csize,
                offset=cdx_row.warc_offset,
                warc_path=cdx_row.warc_path,
            )
            body = resource.body
            if resource.revisit_cdx:
                assert resource.revisit_cdx.sha1hex == cdx_row.sha1hex
                revisit_cdx = resource.revisit_cdx
        else:
            # note: currently not trying to verify cdx_row.sha1hex
            try:
                body = wayback_client.fetch_replay_body(
                    url=cdx_row.url,
                    datetime=cdx_row.datetime,
                )
            except (WaybackError, WaybackContentError):
                return miss("spn2-wayback-error", cdx_row.url, cdx_row.datetime)
            # warc_path etc will change, so strip them out
            cdx_row = cdx_partial_from_row(cdx_row)

        assert cdx_row.status_code
        is_hit = cdx_row.status_code in (200, 226)
        return ResourceResult(
            start_url=start_url,
            hit=is_hit,
            status="success" if is_hit else "terminal-bad-status",
            terminal_url=cdx_row.url,
            terminal_dt=cdx_row.datetime,
            terminal_status_code=cdx_row.status_code,
            body=body,
            cdx=cdx_row,
            revisit_cdx=revisit_cdx,
        )
+
+
def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
    """
    Unwraps a gzip layer that was left around a response body.

    If the body was identified as 'application/gzip' but the CDX row records
    some other mimetype, the body is decompressed and a new
    (file_meta, resource) pair describing the inner body is returned.
    Otherwise both arguments are returned unchanged.

    Raises Exception if the decompressed body is empty.
    """
    looks_wrapped = (
        resource.body
        and file_meta['mimetype'] == 'application/gzip'
        and resource.cdx
        and resource.cdx.mimetype != 'application/gzip'
    )
    if not looks_wrapped:
        return (file_meta, resource)

    print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
    unwrapped = gzip.decompress(resource.body)
    if not unwrapped:
        raise Exception("null body inside transfer encoding")
    # same resource, with only the body swapped for the decompressed bytes
    unwrapped_resource = ResourceResult(
        start_url=resource.start_url,
        hit=resource.hit,
        status=resource.status,
        terminal_url=resource.terminal_url,
        terminal_dt=resource.terminal_dt,
        terminal_status_code=resource.terminal_status_code,
        body=unwrapped,
        cdx=resource.cdx,
        revisit_cdx=resource.revisit_cdx,
    )
    return (gen_file_metadata(unwrapped_resource.body), unwrapped_resource)
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
new file mode 100644
index 0000000..b852c69
--- /dev/null
+++ b/python/sandcrawler/ingest.py
@@ -0,0 +1,833 @@
+
+import sys
+import json
+import gzip
+import time
+import base64
+import xml.etree.ElementTree
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.pdfextract import process_pdf, PdfExtractResult
+from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_ingest import fetch_html_resources, \
+ quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
+ WebResource, html_guess_platform
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.xml import xml_reserialize
+
+
# 128 MiB. Presumably the largest body the ingest worker will handle; the
# code that uses it is outside this part of the file -- TODO confirm.
MAX_BODY_SIZE_BYTES = 128 * (1 << 20)
+
+class IngestFileWorker(SandcrawlerWorker):
+ """
+ High level flow is to look in history first, then go to live web if
+ resource not found. Following redirects is treated as "fetching a
+ resource". Current version fetches a single resource; if it isn't a hit
+ but is an HTML 200, treats it as a landing page, tries to extract
+ fulltext link, then fetches that resource.
+
+ process(request, key=None) -> response
+ Does all the things!
+
+ Check existing processing (short circuit):
+
+ check_existing_ingest(base_url) -> ingest_file_result or none
+ process_existing(result) -> response
+ try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
+
+ Fetch resource:
+
+ find_resource(url) -> ResourceResult
+
+ Process resource:
+
+ process_hit(ResourceResult) -> response
+ process_grobid(ResourceResult)
+ """
+
def __init__(self, sink=None, **kwargs):
    """
    Args:
        sink: output sink for this worker's results

    Keyword args (all optional):
        wayback_client, spn_client, grobid_client, pgrest_client: service
            clients; a default instance is created for any that is
            missing or falsy
        spn_cdx_retry_sec: forwarded to the default SavePageNowClient
        grobid_sink, thumbnail_sink, pdftext_sink, xmldoc_sink,
            htmlteixml_sink: optional sinks for derived outputs
        try_existing_ingest (False), try_existing_grobid (True),
            try_existing_pdfextract (True), try_wayback (True),
            try_spn2 (True), html_quick_mode (False): behavior flags
    """
    super().__init__()

    self.sink = sink
    self.wayback_client = kwargs.get('wayback_client') or WaybackClient()
    self.spn_client = kwargs.get('spn_client') or SavePageNowClient(
        spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
    self.grobid_client = kwargs.get('grobid_client') or GrobidClient()
    self.pgrest_client = kwargs.get('pgrest_client') or SandcrawlerPostgrestClient()
    self.grobid_sink = kwargs.get('grobid_sink')
    self.thumbnail_sink = kwargs.get('thumbnail_sink')
    self.pdftext_sink = kwargs.get('pdftext_sink')
    self.xmldoc_sink = kwargs.get('xmldoc_sink')
    self.htmlteixml_sink = kwargs.get('htmlteixml_sink')
    self.max_hops = 6

    self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
    self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
    self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
    self.try_wayback = kwargs.get('try_wayback', True)
    self.try_spn2 = kwargs.get('try_spn2', True)
    self.html_quick_mode = kwargs.get('html_quick_mode', False)
    self.adblock_rules = load_adblock_rules()
    self.max_html_resources = 200

    # URL substrings; entries are matched with a plain "in" test elsewhere
    self.base_url_blocklist = [
        # robot blocking
        "://hkvalidate.perfdrive.com/",

        # temporary, until we implement specific fetch and 'petabox' output
        "://archive.org/",
        "://www.archive.org/",
        "://web.archive.org/web/",

        # out of scope
        "://openlibrary.org/",
        "://www.openlibrary.org/",
        "://fatcat.wiki/",
        "://orcid.org/",
        "://doaj.org/",

        # Domain squats
        "://bartandjones.com",
        "://ijretm.com",
        "://ijrcemas.com",
        "://jist.net.in",
        "://croisements-revue.org",

        # all stubs/previews, not full papers
        "://page-one.live.cf.public.springer.com",

        # large datasets-only (no PDF expected)
        "plutof.ut.ee/",
        "www.gbif.org/",
        "doi.pangaea.de/",
        "www.plate-archive.org/",
        "://doi.org/10.25642/ipk/gbis/",
        "://apex.ipk-gatersleben.de/",
        "fao.org/glis/",

        # Historical non-paper content:
        "dhz.uni-passau.de/",  # newspapers
        "digital.ucd.ie/",  # ireland national historical

        # DOI prefixes
        "doi.org/10.2307/",  # JSTOR; slow and many redirects
        "doi.org/10.18730/",  # fao.org: database entry
        "doi.org/10.15468/",  # gbif.org: database entry

        # deprecated domain (doesn't redirect correctly)
        "://edoc.mpg.de/",
    ]

    self.wall_blocklist = [
        # loginwall
        "://profile.thieme.de/HTML/sso/ejournals/login.htm",
        "://login.bepress.com/",
        "?SAMLRequest=",
        "://osapublishing.org/captcha/",
        "/password-login",
        "://gateway.isiknowledge.com/",
        "/login?TARGET=",
    ]

    self.cookie_blocklist = [
        "/cookieAbsent",
        "cookieSet=1",
        "error=cookies_not_supported",
    ]

    # these are special-case web domains for which we want SPN2 to not run
    # a headless browser (brozzler), but instead simply run wget.
    # the motivation could be to work around browser issues, or in the
    # future possibly to increase download efficiency (wget/fetch being
    # faster than browser fetch)
    self.spn2_simple_get_domains = [
        # direct PDF links
        "://arxiv.org/pdf/",
        "://europepmc.org/backend/ptpmcrender.fcgi",
        "://pdfs.semanticscholar.org/",
        "://res.mdpi.com/",

        # platform sites
        "://zenodo.org/",
        "://figshare.org/",
        "://springernature.figshare.com/",

        # popular simple cloud storage or direct links
        "://s3-eu-west-1.amazonaws.com/",
    ]

    self.src_valid_mimetypes = [
        "text/x-tex",
        "application/gzip",
        "application/x-bzip",
        "application/x-bzip2",
        "application/zip",
        "application/x-tar",
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ]

    self.component_valid_mimetypes = [
        "image/jpeg",
        "image/tiff",
        "image/png",
        "image/gif",
        "audio/mpeg",
        "video/mp4",
        "video/mpeg",
        "text/plain",
        "text/csv",
        "application/json",
        "application/xml",
        "application/pdf",
        "application/gzip",
        "application/x-bzip",
        "application/x-bzip2",
        # the next two entries used to carry a trailing space, which made
        # exact-match membership tests fail for zip and rar files
        "application/zip",
        "application/x-rar",
        "application/x-7z-compressed",
        "application/x-tar",
        "application/vnd.ms-powerpoint",
        "application/vnd.ms-excel",
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ]
+
+
def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
    """
    Consults sandcrawler-db (via the postgrest client) for a previous
    ingest result for this ingest type and URL.

    Returns the stored row only when lookups are enabled
    (try_existing_ingest) and the row is a hit; returns None in every
    other case.
    """
    if not self.try_existing_ingest:
        return None
    row = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
    # TODO: filter on more flags?
    if not row:
        return None
    # explicit comparison kept on purpose: only a value equal to True counts
    if row['hit'] == True:
        return row
    return None
+
def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]:
    """
    Looks in wayback for a resource starting at the URL, following any
    redirects. If a hit isn't found, try crawling with SPN.

    Args:
        url: URL to fetch
        best_mimetype: passed to the wayback lookup
        force_recrawl: skip the wayback lookup and go straight to SPN2

    Returns the ResourceResult from wayback or SPN2, or None when neither
    lookup ran (both disabled, or wayback skipped and SPN2 disabled).

    Raises NotImplementedError for web.archive.org/web/ and archive.org
    URLs.
    """
    via = "none"
    resource = None

    if url.startswith(("http://web.archive.org/web/", "https://web.archive.org/web/")):
        raise NotImplementedError("handling direct wayback links not supported yet")

    if url.startswith(("http://archive.org/", "https://archive.org/")):
        raise NotImplementedError("fetching from archive.org not implemented yet")

    if self.try_wayback and not force_recrawl:
        via = "wayback"
        resource = self.wayback_client.lookup_resource(url, best_mimetype)

    # check for "soft 404" conditions, where we should retry with live SPNv2
    soft404 = False
    # NOTE: these are often not working with SPNv2 either, so disabling. If
    # we really want to try again, should do force-recrawl
    #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
    #    soft404 = True

    # a failed capture from before 2019 is worth re-crawling; the timestamp
    # is compared as a string, which works for fixed-width digit strings
    old_failure = bool(
        resource
        and not resource.hit
        and resource.terminal_dt
        and resource.terminal_dt < '20190000000000'
    )

    needs_crawl = (
        resource is None
        or resource.status == 'no-capture'
        or soft404
        or old_failure
    )
    if self.try_spn2 and needs_crawl:
        via = "spn2"
        # 1 asks SPN2 for a simple GET on the listed domains, 0 otherwise
        force_simple_get = int(any(domain in url for domain in self.spn2_simple_get_domains))
        resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
    print("[FETCH {:>6}] {} {}".format(
            via,
            (resource and resource.status),
            (resource and resource.terminal_url) or url),
        file=sys.stderr)
    return resource
+
def process_existing(self, request: dict, result_row: dict) -> dict:
    """
    Intended to turn an existing ingest file result row into a response by
    fetching the related database rows.

    Currently disabled: always raises NotImplementedError. The statements
    after the raise are unreachable and kept as a sketch of the intended
    behavior.
    """
    raise NotImplementedError("process_existing() not tested or safe yet")
    assert result_row['hit']
    sha1hex = result_row['terminal_sha1hex']
    existing_file_meta = self.pgrest_client.get_file_meta(sha1hex)
    existing_grobid = self.pgrest_client.get_grobid(sha1hex)
    existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
    have_all_rows = existing_file_meta and existing_grobid and existing_cdx
    if not have_all_rows:
        raise NotImplementedError("partially-exsiting records not implemented yet")
    terminal = {
        'terminal_url': result_row['terminal_url'],
        'terminal_dt': result_row['terminal_dt'],
        'terminal_status_code': result_row['terminal_status_code'],
        'terminal_sha1hex': result_row['terminal_sha1hex'],
    }
    return {
        'hit': result_row['hit'],
        'status': "existing",
        'request': request,
        'grobid': existing_grobid,
        'file_meta': existing_file_meta,
        'cdx': existing_cdx,
        'terminal': terminal,
    }
+
def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
    """
    Dispatches post-fetch processing of a fresh ingest hit by ingest type.

    - "pdf": GROBID processing, then PDF extraction
    - "xml": XML processing
    - "html": HTML processing; an empty 'html_biblio' is dropped from the result
    - "src" / "component": no extra processing (empty dict)

    Raises NotImplementedError for any other ingest type.
    """
    if ingest_type == "pdf":
        # GROBID runs before pdfextract
        grobid = self.process_grobid(resource, file_meta)
        pdf_meta = self.process_pdfextract(resource, file_meta)
        return {'grobid': grobid, 'pdf_meta': pdf_meta}

    if ingest_type == "xml":
        return {'xml_meta': self.process_xml(resource, file_meta)}

    if ingest_type == "html":
        html_info = self.process_html(resource, file_meta)
        # if there is no html_biblio, don't clobber anything possibly extracted earlier
        if 'html_biblio' in html_info and not html_info['html_biblio']:
            del html_info['html_biblio']
        return html_info

    if ingest_type in ("src", "component"):
        return {}

    raise NotImplementedError(f"process {ingest_type} hit")
+
def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
    """
    Submits the resource body to GROBID for processing.

    When try_existing_grobid is set, an existing row for this sha1hex in
    sandcrawler-db is returned as-is instead of re-processing.

    Otherwise the body is processed; if a grobid_sink is configured, a
    copy of the full result (including 'file_meta' and 'key') is pushed to
    it. On success, extracted metadata is attached under 'metadata', with
    'fatcat_release' and 'grobid_version' lifted to the top level. The
    returned dict never contains 'tei_xml', 'file_meta' or 'key'.

    TODO: By default checks sandcrawler-db for an existing row first, then
    decide if we should re-process
    """
    if self.try_existing_grobid:
        existing = self.pgrest_client.get_grobid(file_meta['sha1hex'])
        if existing:
            print("found existing GROBID result", file=sys.stderr)
            return existing

    # Need to actually process
    result = self.grobid_client.process_fulltext(resource.body)
    if self.grobid_sink:
        # extra fields for GROBID kafka messages
        result['file_meta'] = file_meta
        result['key'] = result['file_meta']['sha1hex']
        self.grobid_sink.push_record(result.copy())
    if result['status'] == "success":
        # extract once and reuse; this used to be computed twice
        metadata = self.grobid_client.metadata(result)
        if metadata:
            result['metadata'] = metadata
            result['fatcat_release'] = metadata.pop('fatcat_release', None)
            result['grobid_version'] = metadata.pop('grobid_version', None)
    # strip bulky / sink-only fields from the returned summary
    result.pop('tei_xml', None)
    result.pop('file_meta', None)
    result.pop('key', None)
    return result
+
def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
    """
    Extracts thumbnail and pdf_meta info from PDF.

    When try_existing_pdfextract is set and sandcrawler-db already has a
    pdf_meta row for this sha1hex, that row is converted and returned
    without re-processing.

    Otherwise the body is processed; the first-page thumbnail and the
    pdftext record are pushed to their sinks when configured, and the
    returned dict is built after thumbnail, text and file_meta have been
    cleared from the result.

    TODO: difference between Kafka schema and SQL/postgrest schema
    """
    if self.try_existing_pdfextract:
        cached = self.pgrest_client.get_pdf_meta(file_meta['sha1hex'])
        if cached:
            print("found existing pdf_meta result", file=sys.stderr)
            return PdfExtractResult.from_pdf_meta_dict(cached).to_pdftext_dict()

    # Need to actually process
    extracted = process_pdf(resource.body)
    assert extracted.file_meta['sha1hex'] == file_meta['sha1hex']

    thumbnail = extracted.page0_thumbnail
    if self.thumbnail_sink and thumbnail is not None:
        self.thumbnail_sink.push_record(thumbnail, key=extracted.sha1hex)
    if self.pdftext_sink:
        self.pdftext_sink.push_record(extracted.to_pdftext_dict(), key=extracted.sha1hex)

    # drop the bulky fields before building the returned summary
    extracted.page0_thumbnail = None
    extracted.text = None
    extracted.file_meta = None
    return extracted.to_pdftext_dict()
+
def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
    """
    Publishes JATS XML bodies to the xmldoc sink.

    Only acts when an xmldoc sink is configured and the file mimetype is
    "application/jats+xml"; the body is re-serialized and pushed, keyed by
    sha1hex. Returns {"status": "xml-parse-error"} if re-serialization
    fails to parse, otherwise {"status": "success"} (also when nothing was
    published).

    In the future, could extract other metadata here (like body word
    count), or attempting to fetch sub-resources.
    """
    should_publish = self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml"
    if not should_publish:
        return dict(status="success")

    try:
        jats_xml = xml_reserialize(resource.body)
    except xml.etree.ElementTree.ParseError:
        return dict(status="xml-parse-error")

    sha1hex = file_meta["sha1hex"]
    record = dict(
        sha1hex=sha1hex,
        status="success",
        jats_xml=jats_xml,
    )
    self.xmldoc_sink.push_record(record, key=sha1hex)
    return dict(status="success")
+
def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
    """
    Processes an HTML hit: parses the document, extracts bibliographic
    metadata and body TEI-XML, guesses platform and scope, then fetches
    the page's sub-resources.

    Returns a dict. Early exits carry a 'status' of
    "html-selectolax-error", one of the "blocked-*" scopes, "wrong-scope",
    "too-many-resources", or a fetch error status (with 'error_message').
    The full result has 'html_resources' and gets status "unknown-scope"
    only when the scope is 'unknown'; otherwise no 'status' key is set
    here.

    'tei_xml' is pushed to the htmlteixml sink (when configured and body
    extraction succeeded) and is removed from every returned 'html_body'.
    """
    assert resource.body
    try:
        html_doc = HTMLParser(resource.body)
    except ValueError:
        return dict(
            status="html-selectolax-error",
        )
    html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
    assert html_biblio
    html_body = html_extract_body_teixml(resource.body)
    html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
    html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))

    # fields shared by every result below
    common = dict(
        html_biblio=json.loads(html_biblio.json(exclude_none=True)),
        scope=html_scope,
        platform=html_platform,
    )

    if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'):
        return dict(status=html_scope, **common)
    if html_scope not in ('article-fulltext','unknown',):
        html_body.pop("tei_xml", None)
        return dict(status="wrong-scope", **common, html_body=html_body)

    raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
    if len(raw_resources) > self.max_html_resources:
        html_body.pop("tei_xml", None)
        return dict(status="too-many-resources", **common, html_body=html_body)

    if self.htmlteixml_sink and html_body['status'] == "success":
        self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])

    html_body.pop("tei_xml", None)

    partial_result = dict(**common, html_body=html_body)

    when = parse_cdx_datetime(resource.cdx.datetime)
    full_resources: List[WebResource] = []

    # checked in this order, so the first matching type decides the status
    fetch_failures = (
        (PetaboxError, 'petabox-error'),
        (CdxApiError, 'cdx-error'),
        (WaybackError, 'wayback-error'),
        (WaybackContentError, 'wayback-content-error'),
        (NoCaptureError, 'html-resource-no-capture'),
    )
    try:
        if self.html_quick_mode:
            print(" WARN: running quick CDX-only fetches", file=sys.stderr)
            full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
        else:
            full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
    except tuple(exc_type for exc_type, _ in fetch_failures) as e:
        for exc_type, failure_status in fetch_failures:
            if isinstance(e, exc_type):
                partial_result['status'] = failure_status
                break
        partial_result['error_message'] = str(e)[:1600]
        return partial_result

    info = dict(
        html_body=html_body,
        **common,
        html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
    )
    if html_scope == 'unknown':
        info['status'] = 'unknown-scope'
    return info
+
+    def timeout_response(self, task: dict) -> dict:
+        """
+        Builds the failure result reported when handling `task` timed out.
+
+        Logs a "[TIMEOUT]" marker to stderr and echoes the task back under
+        the 'request' key.
+        """
+        print("[TIMEOUT]", file=sys.stderr)
+        response = {
+            "request": task,
+            "hit": False,
+            "status": "timeout",
+            "error_message": "ingest worker internal timeout",
+        }
+        return response
+
+    def want(self, request: dict) -> bool:
+        """
+        Returns True when this worker handles the request's 'ingest_type'.
+
+        A missing 'ingest_type' is not wanted. 'file' is accepted here because
+        it is the legacy spelling of 'pdf'.
+        """
+        return request.get('ingest_type') in ('file', 'pdf', 'xml', 'html', 'src', 'component')
+
+ def process(self, request: dict, key: Any = None) -> dict:
+ """
+ Runs a single ingest request and returns a result dict.
+
+ Reads 'ingest_type', 'base_url' and the optional 'force_recrawl' flag
+ from `request`. Starting at the cleaned base URL, resources are fetched
+ and, for HTML-ish landing pages, a next-hop URL is extracted and
+ followed, until a terminal resource is found or a limit/blocklist stops
+ the crawl. The terminal resource is checked against the mimetypes
+ allowed for the ingest type and then handed to process_hit().
+
+ The result always carries 'request' and 'hit'; 'status' says why
+ processing stopped ("success" together with hit=True on the happy path).
+
+ Raises NotImplementedError for an unsupported 'ingest_type'.
+ """
+
+ # old backwards compatibility
+ # (note: rewrites the caller's request dict in place)
+ if request.get('ingest_type') == 'file':
+ request['ingest_type'] = 'pdf'
+
+ ingest_type = request.get('ingest_type')
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request['base_url'])
+
+ force_recrawl = bool(request.get('force_recrawl', False))
+
+ # substring match of each blocklist entry against the cleaned URL
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ # mimetype hint passed to find_resource(); stays None for "component"
+ best_mimetype = None
+ if ingest_type == "pdf":
+ best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
+
+ # short-circuit when check_existing_ingest() reports a prior ingest
+ existing = self.check_existing_ingest(ingest_type, base_url)
+ if existing:
+ return self.process_existing(request, existing)
+
+ result: Dict[str, Any] = dict(request=request, hit=False)
+
+ next_url = base_url
+ # every URL visited so far; also used for link-loop detection below
+ hops = [base_url]
+
+ while len(hops) <= self.max_hops:
+
+ result['hops'] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result['status'] = "skip-url-blocklist"
+ return result
+
+ # check against known loginwall URLs
+ for block in self.wall_blocklist:
+ if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
+ result['status'] = "skip-wall"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result['status'] = 'blocked-cookie'
+ return result
+
+ # each fetch failure maps to its own status; messages are capped at 1600 characters
+ try:
+ resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
+ except SavePageNowError as e:
+ result['status'] = 'spn2-error'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result['status'] = 'petabox-error'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result['status'] = 'cdx-error'
+ result['error_message'] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result['status'] = 'wayback-error'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result['status'] = 'wayback-content-error'
+ result['error_message'] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result['status'] = 'not-implemented'
+ result['error_message'] = str(e)[:1600]
+ return result
+
+ assert resource
+
+ # record where the fetch ended up, even when it was not a hit
+ if resource.terminal_url:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result['hops']:
+ result['hops'].append(resource.terminal_url)
+
+ if not resource.hit:
+ result['status'] = resource.status
+ return result
+
+ # the terminal URL (after redirects) gets the same blocklist checks
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result['status'] = 'skip-url-blocklist'
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result['status'] = 'blocked-cookie'
+ return result
+
+ if not resource.body:
+ result['status'] = 'null-body'
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result['status'] = 'body-too-large'
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ # may replace both file_meta and resource; any failure is reported as bad gzip
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result['status'] = 'bad-gzip-encoding'
+ result['error_message'] = str(e)
+ return result
+
+ # re-check: the body may have changed in fix_transfer_encoding()
+ if not resource.body or file_meta['size_bytes'] == 0:
+ result['status'] = 'null-body'
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta['mimetype']
+ or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta['mimetype']
+ or "text/xml" in file_meta['mimetype']
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ # keep biblio from an earlier hop unless this page has a title
+ if not 'html_biblio' in result or html_biblio.title:
+ result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+ #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ # unparseable HTML: continue without biblio metadata
+ pass
+
+ if ingest_type == "pdf" and html_ish_resource:
+
+ # the new style of URL extraction (already computed)
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+
+ result['extract_next_hop'] = fulltext_url
+ if not fulltext_url:
+ result['status'] = 'no-pdf-link'
+ return result
+ next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') or ""
+ assert next_url
+ next_url = clean_url(next_url)
+ print("[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ fulltext_url.get('technique'),
+ next_url,
+ ),
+ file=sys.stderr)
+ if next_url in hops:
+ result['status'] = 'link-loop'
+ result['error_message'] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif ingest_type in ("xml", "html", "component") and html_ish_resource and html_biblio:
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ # NOTE(review): unlike the pdf branch, this URL is not passed through clean_url() - confirm intended
+ next_url = next_url_found
+ technique = "html_biblio"
+ print("[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr)
+ if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ result['status'] = 'link-loop'
+ result['error_message'] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ # NOTE(review): if this runs after the loop, it also fires when the loop ended
+ # via `break` with exactly max_hops entries in hops - confirm intended
+ if len(hops) >= self.max_hops:
+ result['status'] = "max-hops-exceeded"
+ return result
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit == True
+ assert resource.terminal_status_code in (200, 226)
+
+ # overwrite the per-hop terminal info, now including the body's SHA-1
+ if resource.terminal_url:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta['sha1hex'],
+ }
+
+ result['file_meta'] = file_meta
+ result['cdx'] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
+
+ # the sniffed mimetype must be acceptable for the requested ingest type
+ if ingest_type == "pdf":
+ if file_meta['mimetype'] != "application/pdf":
+ result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
+ result['status'] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
+ result['status'] = "wrong-mimetype"
+ return result
+ elif ingest_type == "src":
+ if file_meta['mimetype'] not in self.src_valid_mimetypes:
+ result['status'] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta['mimetype'] not in self.component_valid_mimetypes:
+ result['status'] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
+
+ # type-specific processing; its output is merged into the result
+ info = self.process_hit(ingest_type, resource, file_meta)
+ result.update(info)
+
+ # check if processing turned up an error
+ # (a missing status from process_hit() counts as success)
+ if info.get('status') not in ('success', None):
+ result['status'] = info['status']
+ return result
+
+ result['status'] = "success"
+ result['hit'] = True
+ if ingest_type == "pdf":
+ print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get('file_meta', {}).get('sha1hex'),
+ result.get('grobid', {}).get('status_code'),
+ result.get('pdf_meta', {}).get('status'),
+ ),
+ file=sys.stderr)
+ else:
+ print("[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get('file_meta', {}).get('sha1hex'),
+ ),
+ file=sys.stderr)
+ return result
+
+
+class IngestFileRequestHandler(BaseHTTPRequestHandler):
+    """
+    Minimal HTTP front-end for ingest requests.
+
+    `POST /ingest` with a JSON request body runs the request through a fresh
+    IngestFileWorker and responds with the JSON-encoded result. Any other
+    path gets a 404; a body that cannot be read as JSON gets a 400.
+    """
+
+    def do_POST(self):
+        if self.path != "/ingest":
+            self.send_response(404)
+            self.end_headers()
+            # wfile is a binary stream, so only bytes may be written to it
+            self.wfile.write(b"404: Not Found")
+            return
+        try:
+            # a missing Content-Length header is treated as an empty body
+            length = int(self.headers.get('content-length') or 0)
+            request = json.loads(self.rfile.read(length).decode('utf-8'))
+        except ValueError:
+            # covers a non-numeric length, invalid UTF-8 and invalid JSON
+            self.send_response(400)
+            self.end_headers()
+            self.wfile.write(b"400: Bad Request")
+            return
+        print("Got request: {}".format(request))
+        ingester = IngestFileWorker()
+        result = ingester.process(request)
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(json.dumps(result).encode('utf-8'))
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
new file mode 100644
index 0000000..c7deea1
--- /dev/null
+++ b/python/sandcrawler/minio.py
@@ -0,0 +1,99 @@
+
+import io
+import os
+import hashlib
+
+import minio
+
+
+class SandcrawlerMinioClient(object):
+    """
+    Small wrapper around a minio client which stores and fetches blobs under
+    the sandcrawler SHA-1 path convention (see _blob_path()).
+    """
+
+    def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+        """
+        host_url is the minio connection string (host:port)
+        access and secret key are as expected
+        default_bucket can be supplied so that it doesn't need to be repeated for each function call
+
+        The client is created with secure=False.
+
+        Example config (environment variable names are illustrative):
+
+            host="localhost:9000",
+            access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+            secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
+        """
+        self.mc = minio.Minio(
+            host_url,
+            access_key=access_key,
+            secret_key=secret_key,
+            secure=False,
+        )
+        self.default_bucket = default_bucket
+
+    def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+        """
+        Returns the object path
+        "{prefix}{folder}/{sha1[0:2]}/{sha1[2:4]}/{sha1}{extension}".
+
+        extension and prefix may be None or empty. sha1hex must be exactly
+        40 characters long.
+        """
+        if not extension:
+            extension = ""
+        if not prefix:
+            prefix = ""
+        assert len(sha1hex) == 40
+        obj_path = "{}{}/{}/{}/{}{}".format(
+            prefix,
+            folder,
+            sha1hex[0:2],
+            sha1hex[2:4],
+            sha1hex,
+            extension,
+        )
+        return obj_path
+
+    def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+        """
+        blob should be bytes (a str is encoded as UTF-8 first)
+        sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
+        Uploads blob to path in the given bucket. Files are stored in a top-level
+        folder, then in two levels of sub-directory based on sha1, then the
+        filename is SHA1 with an optional file extension.
+
+        The content type is chosen from the extension (.xml, .png, .jpg/.jpeg,
+        .txt), defaulting to "application/octet-stream".
+
+        Returns a (bucket, obj_path) tuple.
+        """
+        if isinstance(blob, str):
+            blob = blob.encode('utf-8')
+        assert isinstance(blob, bytes)
+        if not sha1hex:
+            sha1hex = hashlib.sha1(blob).hexdigest()
+        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
+        if not bucket:
+            bucket = self.default_bucket
+        assert bucket
+        # _blob_path() accepts extension=None, so this must not crash on it either
+        suffix = extension or ""
+        if suffix.endswith('.xml'):
+            content_type = "application/xml"
+        elif suffix.endswith('.png'):
+            content_type = "image/png"
+        elif suffix.endswith('.jpg') or suffix.endswith('.jpeg'):
+            content_type = "image/jpeg"
+        elif suffix.endswith('.txt'):
+            content_type = "text/plain"
+        else:
+            content_type = "application/octet-stream"
+        self.mc.put_object(
+            bucket,
+            obj_path,
+            io.BytesIO(blob),
+            len(blob),
+            content_type=content_type,
+        )
+        return (bucket, obj_path)
+
+    def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+        """
+        sha1hex is sha1 of the blob itself
+
+        Fetches blob from the given bucket/folder, using the sandcrawler SHA1 path convention
+
+        Returns whatever the minio client's get_object() returns; the content
+        is not verified against sha1hex.
+        """
+        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
+        if not bucket:
+            bucket = self.default_bucket
+        assert bucket
+        blob = self.mc.get_object(
+            bucket,
+            obj_path,
+        )
+        # TODO: optionally verify SHA-1?
+        return blob
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
new file mode 100644
index 0000000..a3e2960
--- /dev/null
+++ b/python/sandcrawler/misc.py
@@ -0,0 +1,222 @@
+
+import base64
+import magic
+import hashlib
+import datetime
+from typing import Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+
+
+def clean_url(s: str) -> str:
+    """
+    Strips surrounding whitespace and returns the WHATWG-canonicalized form
+    of the URL (via urlcanon).
+    """
+    url = urlcanon.parse_url(s.strip())
+    # a colon was parsed but no port follows it: drop the dangling colon
+    if url.colon_before_port and not url.port:
+        url.colon_before_port = b''
+    canonical = urlcanon.whatwg(url)
+    return str(canonical)
+
+def url_fuzzy_equal(left: str, right: str) -> bool:
+    """
+    Loose URL comparison: ignores the scheme, any "www." substring, an
+    explicit ":80/" port, and a single trailing slash.
+
+    TODO: use proper surt library and canonicalization for this check
+    """
+    def _fuzzy(url: str) -> str:
+        simplified = clean_url(url).replace('www.', '').replace(':80/', '/')
+        # drop everything up to and including the first "://"
+        return '://'.join(simplified.split('://')[1:])
+
+    a = _fuzzy(left)
+    b = _fuzzy(right)
+    return a == b or a == b + "/" or b == a + "/"
+
+def test_url_fuzzy_equal() -> None:
+    """A "www." host prefix must not make otherwise identical URLs differ."""
+    with_www = "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
+    without_www = "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
+    assert url_fuzzy_equal(with_www, without_www) == True
+
+def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
+    """
+    Computes hashes and sniffs the mimetype of a file blob (bytes).
+
+    An empty blob is rejected (assertion) unless allow_empty is set.
+
+    Returns a dict: size_bytes, sha1hex, sha256hex, md5hex, mimetype
+    """
+    assert blob is not None
+    if not allow_empty:
+        assert blob
+    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    if mimetype in ("application/xml", "text/xml"):
+        # crude checks for XHTML or JATS XML, using only first 1 kB of file
+        head = blob[:1024]
+        if b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head:
+            mimetype = "application/xhtml+xml"
+        elif b"<article " in head and b"<html" not in head:
+            mimetype = "application/jats+xml"
+    return {
+        "size_bytes": len(blob),
+        "sha1hex": hashlib.sha1(blob).hexdigest(),
+        "sha256hex": hashlib.sha256(blob).hexdigest(),
+        "md5hex": hashlib.md5(blob).hexdigest(),
+        "mimetype": mimetype,
+    }
+
+def b32_hex(s: str) -> str:
+    """
+    Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+    base32 checksums are used by, eg, heritrix and in wayback CDX files
+
+    Only the first whitespace-separated token is used, and an optional
+    "sha1:" prefix is dropped. A 40 character input is assumed to already be
+    hex and is returned lower-cased.
+
+    Raises ValueError for anything else, including empty input.
+    """
+    tokens = s.strip().split()
+    if not tokens:
+        # previously an IndexError; callers expect ValueError for bad input
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    s = tokens[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) == 40:
+        return s
+    if len(s) != 32:
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+# Canonical mimetypes; any raw value starting with one of these maps to it
+NORMAL_MIME = (
+    'application/pdf',
+    'application/postscript',
+    'text/html',
+    'text/xml',
+    'application/octet-stream',
+)
+
+def normalize_mime(raw: str) -> Optional[str]:
+    """
+    Maps a raw mimetype string onto one of the NORMAL_MIME values.
+
+    Matching is case-insensitive and ignores surrounding whitespace. Returns
+    None when the value is not recognized.
+    """
+    cleaned = raw.lower().strip()
+    prefix_hit = next((norm for norm in NORMAL_MIME if cleaned.startswith(norm)), None)
+    if prefix_hit is not None:
+        return prefix_hit
+
+    # Special cases
+    if cleaned.startswith('application/xml'):
+        return 'text/xml'
+    if cleaned.startswith('application/x-pdf') or cleaned == '.pdf':
+        return 'application/pdf'
+    octet_stream_aliases = (
+        'application/download',
+        'binary/octet-stream',
+        'unk',
+        'application/x-download',
+        'application/octetstream',
+        'application/force-download',
+        'application/unknown',
+    )
+    if cleaned in octet_stream_aliases:
+        return 'application/octet-stream'
+    return None
+
+
+def test_normalize_mime():
+    """Checks normalize_mime() on recognized, aliased and unknown inputs."""
+    unrecognized = ("asdf", "application/p", "application/x-html")
+    for raw in unrecognized:
+        assert normalize_mime(raw) is None
+
+    expected = [
+        ("application/pdf", "application/pdf"),
+        ("application/pdf+journal", "application/pdf"),
+        ("Application/PDF", "application/pdf"),
+        ("application/xml+stuff", "text/xml"),
+        ("application/x-pdf", "application/pdf"),
+        ("unk", "application/octet-stream"),
+        ("binary/octet-stream", "application/octet-stream"),
+    ]
+    for raw, normalized in expected:
+        assert normalize_mime(raw) == normalized
+
+
+def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+    """
+    Parses one whitespace-separated CDX line (at least 11 columns) into a
+    dict with keys: surt, url, datetime, mimetype, http_status, sha1b32,
+    sha1hex, warc_csize, warc_offset, warc_path.
+
+    Returns None for lines which are too short or malformed. This method
+    always filters a few things out:
+
+    - non-HTTP requests, based on lack of status code (eg, whois)
+
+    If normalize is set (the default), the mimetype is passed through
+    normalize_mime(), falling back to "application/octet-stream" when not
+    recognized; otherwise the raw mimetype column is returned. A "-"
+    mimetype always becomes "application/octet-stream".
+    """
+
+    cdx = raw_cdx.split()
+    if len(cdx) < 11:
+        return None
+
+    surt = cdx[0]
+    dt = cdx[1]
+    url = cdx[2]
+    mime = cdx[3]
+    http_status = cdx[4]
+    sha1b32 = cdx[5]
+    c_size = cdx[8]
+    offset = cdx[9]
+    warc = cdx[10]
+
+    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
+            and len(sha1b32) == 32 and dt.isdigit()):
+        return None
+
+    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+        return None
+
+    # a non-numeric status would otherwise crash the int() conversion below
+    if not http_status.isdigit():
+        return None
+
+    if mime == '-':
+        mime = "application/octet-stream"
+    elif normalize:
+        # previously normalization was applied even with normalize=False
+        mime = normalize_mime(mime) or "application/octet-stream"
+
+    try:
+        sha1hex = b32_hex(sha1b32)
+    except ValueError:
+        # alphanumeric but not valid base32 (binascii.Error is a ValueError)
+        return None
+
+    return dict(
+        surt=surt,
+        url=url,
+        datetime=dt,
+        mimetype=mime,
+        http_status=int(http_status),
+        sha1b32=sha1b32,
+        sha1hex=sha1hex,
+        warc_csize=int(c_size),
+        warc_offset=int(offset),
+        warc_path=warc,
+    )
+
+def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+    """
+    Parses a 14-digit CDX timestamp ("%Y%m%d%H%M%S") into a datetime.
+
+    Returns None for empty or unparseable input.
+    """
+    if dt_str:
+        try:
+            return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+        except Exception:
+            pass
+    return None
+
+def test_parse_cdx_datetime() -> None:
+    """Unparseable input gives None; valid timestamps give the matching datetime."""
+    for unparseable in ("", "asdf"):
+        assert parse_cdx_datetime(unparseable) == None
+    assert parse_cdx_datetime("19930203123045") != None
+    expected = datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    assert parse_cdx_datetime("20201028235103") == expected
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+    """Formats a datetime as a 14-digit CDX timestamp (YYYYMMDDhhmmss)."""
+    fields = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
+    return "{:04d}{:02d}{:02d}{:02d}{:02d}{:02d}".format(*fields)
+
+def test_datetime_to_cdx() -> None:
+    """A datetime is rendered as the matching 14-digit CDX timestamp."""
+    moment = datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+    assert "20201028235103" == datetime_to_cdx(moment)
+
+def requests_retry_session(retries=10, backoff_factor=3,
+        status_forcelist=(500, 502, 504), session=None) -> requests.Session:
+    """
+    Returns a requests session (the one passed in, or a new one) whose
+    http:// and https:// adapters retry according to the given parameters.
+
+    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+    """
+    if not session:
+        session = requests.Session()
+    retry_policy = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
+    for scheme in ('http://', 'https://'):
+        session.mount(scheme, retrying_adapter)
+    return session
+
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
new file mode 100644
index 0000000..9b4e834
--- /dev/null
+++ b/python/sandcrawler/pdfextract.py
@@ -0,0 +1,470 @@
+
+import sys
+import json
+import datetime
+from io import BytesIO
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
+
+import poppler
+from PIL import Image
+
+from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .misc import gen_file_metadata
+
+
+# This is a hack to work around timeouts when processing certain PDFs with
+# poppler. For some reason, the usual Kafka timeout catcher isn't working on
+# these, maybe due to threading.
+BAD_PDF_SHA1HEX = [
+ "011478a1e63a2a31eae1a93832a74cc95f220760",
+ "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+ "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+ "06061af0707298c12932516d1bb7c2b6dc443824",
+ "0641822e68c5a07538b967489fd19a1d5dc371a5",
+ "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+ "09db7c9f2efb496c974427a61e84292ae27fc702",
+ "0a1c13cb8783bbbf248b2345b9890e2410aa3f0a",
+ "0ccc6dc94f4e2d809fac8543870265c3421f3c9e",
+ "0d1c1567ea70e7b922ba88ccb868ffc7ca18e75c",
+ "10c6577a658bf6203557e2998b25ea9788f8adfe",
+ "15a720921ce30da983fcd1bfa7fe9aeeda503e41",
+ "1659881a31edc2d0e170f6bb26d32e74cc4ca387",
+ "17e679b0ec9444fff2ea4d02caec05dd2de80ec3",
+ "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+ "1a17a4fc43397804830cc29021281aac2e8cf0cb",
+ "1cb166f0c0b5ffe673e6bbf6a29d77278711f253",
+ "1d04e46b6848e6479dd90fe26bb11627044fb664",
+ "1d967c95546d31edaaf0c3ef9ffcc11113a9e11a",
+ "1f90194bf0c7fff1fe1ed5fff77a934c7a1b32a0",
+ "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+ "2195e528fa1cf5f8ae3b2adcc516896016c3411f",
+ "25ab9e6169f041be05844a9b4edd6574918af769",
+ "281de904c4642a9be4f17b9774fc0a2bdc8a90e3",
+ "2bd5322975653536550a039eb055174b2bf241b3",
+ "2fc64da736175810918fd32c94c5068b0d660bcc",
+ "32318fba9b05b2756b7362bcaa4722c92ed8d449",
+ "336833c6fc968cd0938250dfc93c032a30111cfc",
+ "362ad00bc24d650c8f11851f9e554fc560b73e7a",
+ "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+ "3ac0b6e17e30d141871a0a5b127536919fe5aa19",
+ "3c8a6a708da0dc1802f5f3e5267a49b3c25e1ffe",
+ "3e5f9fb94e7314447a22f3d009419a922136177f",
+ "3fad493c940137ce703f2f570ebb504e360c6df3",
+ "40aa94602ab13e5a7d9df8c989fca4fa5c01239e",
+ "427479c94d7d0e512f898bc7ff0b6f210069f902",
+ "436c9183724f051b22c96285aa8ff1d2ba709574",
+ "43a8c0abf0386d3e3397cf5e22a884761dd63db7",
+ "445968ef735b228c08c3ff4238d99fc9f4824619",
+ "447fa6b5a90742a86429a932f6608d8e141688c0",
+ "45f014d7d631559dc7726e5c5513f1e7c91c48a9",
+ "47577ff6d6876117ca69bec60a5764f7d2c2ec70",
+ "4785181cec8944eee00ddb631a5dfc771b89bab7",
+ "47db2db2cc976429568841a0496c0ab4ed7b5977",
+ "481c0bae81873988fcc8662ba8a269e8823fdea2",
+ "4c81129904f7976a50825595a3497ea7b52579ef",
+ "4edc1402712fa6827c4501fed8042e9f4447829c",
+ "50b3c5a3122272aca69855ef06b85d0b43a76eb1",
+ "52fc9b3c5199ef395d410c7cee5961dc812e4d29",
+ "53471346019947a88c1ba141fb829375527153b0",
+ "58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
+ "59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
+ "5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5e04779cbbae5ce88bb786064f756885dd6895fe",
+ "5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
+ "646c4a654270606256397684204ff0f3d17be2e7",
+ "64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "689b5cb3ddef213d612363a903f10d0358ea64d2",
+ "6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
+ "74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "75c2662a96ccc48891228df7c85eb7d4da9dd621",
+ "771f1ca0007a6fbed5b4a434c73f524f715d33c1",
+ "776859635e9dc01d97b0582f49c814ffbcb019fb",
+ "781dafda896a9f5c30f3d0a011f79a3b79b574c4",
+ "788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
+ "79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7cfc0739be9c49d94272110a0a748256bdde9be6",
+ "7daf61526ec825151f384cc1db510ca5237d5d80",
+ "7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
+ "859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
+ "88edcbab1cac2d70af5870422974afc253f4f0c6",
+ "89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
+ "8dcaf4ef132900dd378f7be526c884b17452713b",
+ "8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
+ "961ec451172f373f919c593737466300e42062cb",
+ "976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "98b02eb70066c182c705ef4d14d8b723ad7f1fab",
+ "993ca31f6974f8387bb18dd7d38987d290da8781",
+ "9dbd05af3442e6f42d67868054751b76973f4171",
+ "a2298c137b9c8c8975bad62eea9224edb95e6952",
+ "a2671738755ab8b24775e95375dc72f1ca4e5fd6",
+ "a26f299fb97c646effeebd4c5e2968786bd0f781",
+ "a48f9b7ad627909f76d780aa4208530304ece42c",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a8357c31837404f9ebd798999d546c9398ab3648",
+ "a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "ad038725bf6855a79f3c768ebe93c7103d14522f",
+ "aef581bf42e76e527f5aed3b8958fd4e7a24819f",
+ "b2b66b9c7f817a20144456f99c0be805602e8597",
+ "b2d719120306b90eb8dd3580b699a61ec70556f4",
+ "b4b8e18e27f102e59b2be2d58c7b54d0a0eb457a",
+ "b5be7f409a3a2601208c5ce08cf52b9ac1094aae",
+ "b5bf8b7467fb095c90adf3b49aa1687291e4469c",
+ "b8b427e5b3d650ba9e03197f9c3917e25b878930",
+ "bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
+ "be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "c1b583fbd052572f08158d39ffe4d7510dadbebb",
+ "c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
+ "c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
+ "c7687fa6f637c7d32a25be0e772867d87536d35c",
+ "c7d8b37ec99cf0d987e60667f05299f200e18a5d",
+ "c92b9ae9eefa07504950b405625aef54b48f0e1a",
+ "ccb1debcfae006a3fc984e9e91309b9706a5c375",
+ "cd611c765cbb0b3b7cb2fdc07d8f0b9cc93ec257",
+ "cd8a7c3b8d850ebedc1ca791ccb37b9a2689f9c3",
+ "d055c054c330f99ec011e37186d2b429339758fd",
+ "d17b1e254cce82df5c6eb4fd492cef91e7e11558",
+ "d188762a7e3ab5d4ee8a897204316513e4e636ec",
+ "d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
+ "d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "e01bb7256d77aea258313bb410dfcfc10512f420",
+ "e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
+ "e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
+ "e64714a81f60ab9286ec90cad682cb22e564fb6f",
+ "e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
+ "e9e84e17383e93a784a8471708619162b32fb399",
+ "eac7df5f799983d5a7cc55d10b4d426dc557febf",
+ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
+ "eb1b39fd7a874896688855a22efddef10272427c",
+ "eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
+ "ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
+ "ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
+ "ef1dfa325c21cff4cd8bb1a9b6c4ee6996d43c8f",
+ "ef6749d9263a01f921ba7d72df0d17671d14e5f6",
+ "f0ea221d8587cede25592266486e119d277f7096",
+ "f68f9a9202a75d2aee35252e104d796f9515001e",
+ "f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "fd9bd560662e070b222d63052830837829c490f0",
+]
+
+@dataclass
+class PdfExtractResult:
+    """
+    Outcome of PDF text/thumbnail extraction for a single file, with
+    conversions to and from the Kafka message dict, from the database row
+    dict, and to a SQL row tuple.
+    """
+    # SHA-1 (hex) of the PDF; also used as the 'key' of the published dict
+    sha1hex: str
+    # "success", or an error status
+    status: str
+    error_msg: Optional[str] = None
+    file_meta: Optional[Dict[str,Any]] = None
+    text: Optional[str] = None
+    # thumbnail image bytes; never included in to_pdftext_dict()
+    page0_thumbnail: Optional[bytes] = None
+    has_page0_thumbnail: bool = False
+    meta_xml: Optional[str] = None
+    pdf_info: Optional[Dict[str,Any]] = None
+    pdf_extra: Optional[Dict[str,Any]] = None
+    source: Optional[Dict[str,Any]] = None
+
+    def to_pdftext_dict(self) -> dict:
+        """
+        Returns a dict, as would be published (JSON-encoded) to the Kafka
+        text/info topic. The thumbnail bytes are not included, only the
+        'has_page0_thumbnail' flag.
+        """
+        return {
+            'key': self.sha1hex,
+            'sha1hex': self.sha1hex,
+            'status': self.status,
+            'file_meta': self.file_meta,
+            'error_msg': self.error_msg,
+            'text': self.text,
+            'has_page0_thumbnail': self.has_page0_thumbnail,
+            'meta_xml': self.meta_xml,
+            'pdf_info': self.pdf_info,
+            'pdf_extra': self.pdf_extra,
+            'source': self.source,
+        }
+
+    @classmethod
+    def from_pdftext_dict(cls, record):
+        """
+        Parses a dict in the shape produced by to_pdftext_dict().
+
+        For non-success records only sha1hex (falling back to 'key'), status
+        and error_msg are kept.
+        """
+        if record['status'] != 'success':
+            return cls(
+                sha1hex=record.get('sha1hex') or record['key'],
+                status=record['status'],
+                error_msg=record.get('error_msg'),
+            )
+        else:
+            return cls(
+                sha1hex=record['sha1hex'],
+                status=record['status'],
+                file_meta=record.get('file_meta'),
+                text=record.get('text'),
+                has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
+                meta_xml=record.get('meta_xml'),
+                pdf_info=record.get('pdf_info'),
+                pdf_extra=record.get('pdf_extra'),
+            )
+
+    @classmethod
+    def from_pdf_meta_dict(cls, record):
+        """
+        Parses what would be returned from postgrest
+
+        For successful rows the 'metadata' column becomes pdf_info, and the
+        non-empty page/version columns are collected into pdf_extra.
+        """
+        if record['status'] != 'success':
+            return cls(
+                sha1hex=record['sha1hex'],
+                status=record['status'],
+                error_msg=(record.get('metadata') or {}).get('error_msg'),
+            )
+        else:
+            pdf_extra = dict()
+            for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', 'pdf_version'):
+                if record.get(k):
+                    pdf_extra[k] = record[k]
+            return cls(
+                sha1hex=record['sha1hex'],
+                status=record['status'],
+                has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
+                pdf_info=record.get('metadata'),
+                pdf_extra=pdf_extra,
+            )
+
+    def to_sql_tuple(self) -> tuple:
+        """
+        Returns the row tuple for the pdf_meta table, in column order:
+
+        pdf_meta (sha1hex, updated, status, page0_thumbnail, page_count,
+        word_count, page0_height, page0_width, permanent_id, pdf_created,
+        pdf_version, metadata)
+
+        'updated' is the current local time; 'word_count' is the number of
+        whitespace-separated tokens in the text; 'metadata' is a JSON string
+        of selected pdf_info fields with lower-cased keys (or None).
+        """
+        word_count: Optional[int] = None
+        if self.text:
+            word_count = len(self.text.split())
+        metadata: Optional[Dict] = None
+        pdf_extra = self.pdf_extra or dict()
+        pdf_created = None
+        # TODO: form, encrypted
+        if self.pdf_info:
+            metadata = dict()
+            for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'):
+                if k in self.pdf_info:
+                    metadata[k.lower()] = self.pdf_info[k]
+            if 'CreationDate' in self.pdf_info:
+                pdf_created = self.pdf_info['CreationDate']
+        metadata_json: Optional[str] = None
+        if metadata:
+            metadata_json = json.dumps(metadata, sort_keys=True)
+        return (
+            self.sha1hex,
+            datetime.datetime.now(), # updated
+            self.status,
+            self.has_page0_thumbnail,
+            pdf_extra.get('page_count'),
+            word_count,
+            pdf_extra.get('page0_height'),
+            pdf_extra.get('page0_width'),
+            pdf_extra.get('permanent_id'),
+            pdf_created,
+            pdf_extra.get('pdf_version'),
+            metadata_json,
+        )
+
+
+def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult:
+    """
+    Extracts full text, a page-0 thumbnail, and metadata from a PDF blob using
+    poppler.
+
+    blob: raw bytes of the file to process
+    thumb_size: bounding box passed to PIL's Image.thumbnail()
+    thumb_type: image format name passed to PIL's Image.save()
+
+    Expected failure modes are reported through the `status` field of the
+    returned PdfExtractResult instead of being raised: 'not-pdf', 'bad-pdf',
+    'empty-pdf', 'empty-page0', 'parse-error', 'text-too-large',
+    'bad-unicode'. On success, status is 'success'.
+
+    A known issue is that output text is in "physical layout" mode, which means
+    columns will be side-by-side. We would prefer a single stream of tokens!
+
+    Tried using page.text(layout_mode=poppler.TextLayout.raw_order_layout)
+    instead of the default mode (poppler.TextLayout.physical_layout), but that
+    didn't seem to work at all (returned empty strings).
+    """
+    file_meta = gen_file_metadata(blob)
+    sha1hex = file_meta['sha1hex']
+    if file_meta['mimetype'] != 'application/pdf':
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='not-pdf',
+            error_msg=f"mimetype is '{file_meta['mimetype']}'",
+            file_meta=file_meta,
+        )
+
+    if sha1hex in BAD_PDF_SHA1HEX:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='bad-pdf',
+            error_msg="PDF known to cause processing issues",
+            file_meta=file_meta,
+        )
+
+    print(f"\tpoppler processing: {sha1hex}", file=sys.stderr)
+    try:
+        pdf = poppler.load_from_data(blob)
+        if pdf is None:
+            return PdfExtractResult(
+                sha1hex=sha1hex,
+                status='empty-pdf',
+                file_meta=file_meta,
+                has_page0_thumbnail=False,
+            )
+        page0 = pdf.create_page(0)
+        if page0 is None:
+            return PdfExtractResult(
+                sha1hex=sha1hex,
+                status='empty-page0',
+                file_meta=file_meta,
+            )
+        # this call sometimes fails and raises an AttributeError
+        page0rect = page0.page_rect()
+    except (AttributeError, poppler.document.LockedDocumentError) as e:
+        # may need to expand the set of exceptions caught here over time, but
+        # starting with a narrow set
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='parse-error',
+            error_msg=str(e),
+            file_meta=file_meta,
+        )
+
+    assert page0 is not None
+    page0_thumbnail: Optional[bytes] = None
+    renderer = poppler.PageRenderer()
+    try:
+        full_img = renderer.render_page(page0)
+        img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "BGRA", 0, 1)
+        img.thumbnail(thumb_size, Image.BICUBIC)
+        buf = BytesIO()
+        img.save(buf, thumb_type)
+        page0_thumbnail = buf.getvalue()
+        # assuming that very small images mean something went wrong
+        if page0_thumbnail is None or len(page0_thumbnail) < 50:
+            page0_thumbnail = None
+    except Exception as e:
+        # thumbnailing is best-effort: log and continue without a thumbnail
+        print(str(e), file=sys.stderr)
+        page0_thumbnail = None
+
+    try:
+        # collect per-page text and join once, instead of repeated string
+        # concatenation (quadratic for documents with many pages)
+        page_texts = [page0.text()]
+        for n in range(1, pdf.pages):
+            pageN = pdf.create_page(n)
+            # create_page() may return None; the resulting AttributeError is
+            # handled below as a parse error
+            page_texts.append(pageN.text())
+        full_text = "".join(page_texts)
+    except AttributeError as e:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='parse-error',
+            error_msg=str(e),
+            file_meta=file_meta,
+        )
+
+    # Kafka message size limit; cap at about 1 MByte
+    if len(full_text) > 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            error_msg="full_text chars: {}".format(len(full_text)),
+            file_meta=file_meta,
+        )
+    # read metadata once; guard against None/empty before measuring it
+    meta_xml = pdf.metadata
+    if meta_xml and len(meta_xml) > 1000000:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='text-too-large',
+            # report the size of the metadata blob (not of the full text)
+            error_msg="meta_xml chars: {}".format(len(meta_xml)),
+            file_meta=file_meta,
+        )
+
+    try:
+        pdf_info = pdf.infos()
+    except UnicodeDecodeError:
+        return PdfExtractResult(
+            sha1hex=sha1hex,
+            status='bad-unicode',
+            error_msg="in infos()",
+            file_meta=file_meta,
+        )
+
+    # TODO: is this actually needed? or does json marshalling work automatically?
+    for k in pdf_info.keys():
+        if isinstance(pdf_info[k], datetime.datetime):
+            pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])
+
+    permanent_id: Optional[str] = None
+    update_id: Optional[str] = None
+    try:
+        permanent_id = pdf.pdf_id.permanent_id
+        update_id = pdf.pdf_id.update_id
+    except TypeError:
+        # IDs are optional; leave them as None when they can't be read
+        pass
+
+    return PdfExtractResult(
+        sha1hex=sha1hex,
+        file_meta=file_meta,
+        status='success',
+        error_msg=None,
+        text=full_text or None,
+        has_page0_thumbnail=page0_thumbnail is not None,
+        page0_thumbnail=page0_thumbnail,
+        meta_xml=meta_xml or None,
+        pdf_info=pdf_info,
+        pdf_extra=dict(
+            page0_height=page0rect.height,
+            page0_width=page0rect.width,
+            page_count=pdf.pages,
+            permanent_id=permanent_id,
+            update_id=update_id,
+            pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}",
+        ),
+    )
+
+class PdfExtractWorker(SandcrawlerFetchWorker):
+    """
+    Fetches a PDF blob for each record (via SandcrawlerFetchWorker.fetch_blob),
+    runs process_pdf() on it, and returns the result in "pdftext" dict form.
+
+    If a `thumbnail_sink` keyword argument is supplied, page-0 thumbnails are
+    pushed to it, keyed by sha1hex.
+    """
+
+    def __init__(self, wayback_client=None, sink=None, **kwargs):
+        # parent constructor stores wayback_client on self
+        super().__init__(wayback_client=wayback_client)
+        self.sink = sink
+        self.thumbnail_sink = kwargs.get('thumbnail_sink')
+
+    def timeout_response(self, task) -> Dict:
+        """
+        Error record emitted when processing of `task` hits the worker timeout.
+        """
+        default_key = task['sha1hex']
+        return dict(
+            status="error-timeout",
+            error_msg="internal pdf-extract worker timeout",
+            source=task,
+            sha1hex=default_key,
+        )
+
+    def process(self, record, key: Optional[str] = None):
+        """
+        Returns the fetch error dict unchanged if the blob could not be
+        fetched; otherwise the pdftext dict for the extraction result.
+        """
+        fetch_result = self.fetch_blob(record)
+        if fetch_result['status'] != 'success':
+            return fetch_result
+        blob = fetch_result['blob']
+
+        result = process_pdf(blob)
+        result.source = record
+        if self.thumbnail_sink and result.page0_thumbnail is not None:
+            self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+        return result.to_pdftext_dict()
+
+class PdfExtractBlobWorker(SandcrawlerWorker):
+    """
+    Variant of PdfExtractWorker which is handed PDF blobs directly, rather
+    than fetching them from some remote store.
+    """
+
+    def __init__(self, sink=None, **kwargs):
+        super().__init__()
+        self.thumbnail_sink = kwargs.get('thumbnail_sink')
+        self.sink = sink
+
+    def process(self, blob, key: Optional[str] = None):
+        # nothing to do for missing/empty input
+        if not blob:
+            return None
+        assert isinstance(blob, bytes)
+
+        extracted = process_pdf(blob)
+        thumbnail = extracted.page0_thumbnail
+        if self.thumbnail_sink and thumbnail is not None:
+            # thumbnails go to their own sink, keyed by the PDF's sha1hex
+            self.thumbnail_sink.push_record(thumbnail, key=extracted.sha1hex)
+
+        return extracted.to_pdftext_dict()
+
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
new file mode 100644
index 0000000..161dc9c
--- /dev/null
+++ b/python/sandcrawler/pdftrio.py
@@ -0,0 +1,130 @@
+
+import time
+import requests
+
+from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .misc import gen_file_metadata, requests_retry_session
+
+
+class PdfTrioClient(object):
+    """
+    Minimal HTTP client for a pdf_trio classification service.
+    """
+
+    def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+        self.host_url = host_url
+        # NOTE(review): this retrying session is configured, but classify_pdf()
+        # posts with plain `requests.post`; confirm whether retrying the POST is
+        # actually wanted before switching over to the session.
+        self.http_session = requests_retry_session(retries=3, backoff_factor=3)
+
+    def classify_pdf(self, blob, mode="auto"):
+        """
+        Returns a dict with at least:
+
+        - status_code (int, always set)
+        - status (success, or error-*)
+
+        On success, the other remote API JSON response keys are also included,
+        plus `_total_sec` (elapsed request time in seconds).
+
+        On HTTP-level failures, the status_code and status field are set
+        appropriately; an optional `error_msg` may also be set. Request
+        timeouts and connection failures are also returned as error dicts
+        (with negative, heritrix3-style status codes) rather than raised.
+        """
+        assert blob
+
+        try:
+            pdftrio_response = requests.post(
+                self.host_url + "/classify/research-pub/" + mode,
+                files={
+                    'pdf_content': blob,
+                },
+                timeout=60.0,
+            )
+        except requests.Timeout:
+            return {
+                'status': 'error-timeout',
+                'status_code': -4,  # heritrix3 "HTTP timeout" code
+                'error_msg': 'pdftrio request (HTTP POST) timeout',
+            }
+        except requests.exceptions.ConnectionError:
+            # crude back-off
+            time.sleep(2.0)
+            return {
+                'status': 'error-connect',
+                'status_code': -2,  # heritrix3 "HTTP connect" code
+                # this is a connection failure, not a timeout
+                'error_msg': 'pdftrio request connection error',
+            }
+
+        info = dict(
+            status_code=pdftrio_response.status_code,
+        )
+        if pdftrio_response.status_code == 200:
+            resp_json = pdftrio_response.json()
+            # sanity-check the response shape before merging it in
+            assert 'ensemble_score' in resp_json
+            assert 'status' in resp_json
+            assert 'versions' in resp_json
+            info.update(resp_json)
+        else:
+            info['status'] = 'error'
+            # TODO: might return JSON with some info?
+
+        info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+        return info
+
+
+class PdfTrioWorker(SandcrawlerFetchWorker):
+    """
+    This class is basically copied directly from GrobidWorker
+
+    Fetches a blob for each record, classifies it with the pdftrio client,
+    and returns a dict with `file_meta`, `key`, `pdf_trio`, `source` and
+    `timing` entries.
+    """
+
+    def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
+        super().__init__(wayback_client=wayback_client)
+        self.pdftrio_client = pdftrio_client
+        self.sink = sink
+
+    def process(self, record, key=None):
+        start_process = time.time()
+
+        start = time.time()
+        fetch_result = self.fetch_blob(record)
+        fetch_sec = time.time() - start
+        if fetch_result['status'] != 'success':
+            # pass the fetch error record through unchanged
+            return fetch_result
+        blob = fetch_result['blob']
+
+        result = dict()
+        result['file_meta'] = gen_file_metadata(blob)
+        result['key'] = result['file_meta']['sha1hex']
+        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
+        result['source'] = record
+        result['timing'] = dict(
+            # move the client's private timing field into the timing dict
+            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+            total_sec=time.time() - start_process,
+        )
+        if fetch_sec:
+            result['timing']['fetch_sec'] = fetch_sec
+        return result
+
+class PdfTrioBlobWorker(SandcrawlerWorker):
+    """
+    Variant of PdfTrioWorker which is handed blobs directly, rather than
+    fetching them from some remote store.
+    """
+
+    def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
+        super().__init__()
+        self.mode = mode
+        self.sink = sink
+        self.pdftrio_client = pdftrio_client
+
+    def process(self, blob, key=None):
+        started = time.time()
+        if not blob:
+            return None
+
+        file_meta = gen_file_metadata(blob)
+        classification = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+        # the client's private timing field is moved into the timing dict
+        pdftrio_sec = classification.pop('_total_sec', None)
+        return {
+            'file_meta': file_meta,
+            'key': file_meta['sha1hex'],
+            'pdf_trio': classification,
+            'timing': {
+                'pdftrio_sec': pdftrio_sec,
+                'total_sec': time.time() - started,
+            },
+        }
+
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
new file mode 100644
index 0000000..a388b90
--- /dev/null
+++ b/python/sandcrawler/persist.py
@@ -0,0 +1,584 @@
+
+"""
+cdx
+- read raw CDX, filter
+- push to SQL table
+
+ingest-file-result
+- read JSON format (batch)
+- cdx SQL push batch (on conflict skip)
+- file_meta SQL push batch (on conflict update)
+- ingest request push batch (on conflict skip)
+- ingest result push batch (on conflict update)
+
+grobid
+- reads JSON format (batch)
+- grobid2json
+- minio push (one-by-one)
+- grobid SQL push batch (on conflict update)
+- file_meta SQL push batch (on conflict update)
+"""
+
+import os
+from typing import Optional, AnyStr
+import xml.etree.ElementTree
+
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.db import SandcrawlerPostgresClient
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.html_ingest import HtmlMetaRow
+
+
+class PersistCdxWorker(SandcrawlerWorker):
+    """
+    Persists batches of CDX rows to the SQL database; each batch is inserted
+    and committed as a single transaction.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__()
+        self.db = SandcrawlerPostgresClient(db_url)
+        self.cur = self.db.conn.cursor()
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def push_batch(self, batch):
+        """
+        Inserts the full (non-liveweb) CDX rows from `batch`; other rows are
+        counted under 'skip'. Always returns an empty list.
+        """
+        self.counts['total'] += len(batch)
+        # filter to full CDX lines, no liveweb
+        cdx_batch = [r for r in batch if r.get('warc_path') and ("/" in r['warc_path'])]
+        if len(cdx_batch) < len(batch):
+            self.counts['skip'] += len(batch) - len(cdx_batch)
+        # skip the insert entirely when nothing survived the filter, matching
+        # the guarded insert_cdx() call in PersistIngestFileResultWorker
+        if cdx_batch:
+            resp = self.db.insert_cdx(self.cur, cdx_batch)
+            self.counts['insert-cdx'] += resp[0]
+            self.counts['update-cdx'] += resp[1]
+        self.db.commit()
+        return []
+
+class PersistIngestFileResultWorker(SandcrawlerWorker):
+    """
+    Persists batches of ingest-file-result records to SQL: the ingest request,
+    the ingest result, and (for hits) CDX rows, file metadata, and HTML
+    metadata. Each batch is committed as a single transaction.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__()
+        self.db = SandcrawlerPostgresClient(db_url)
+        self.cur = self.db.conn.cursor()
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def request_to_row(self, raw):
+        """
+        Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema
+
+        if there is a problem with conversion, return None
+
+        Note that `raw` is updated in place by the backwards-compatibility
+        fix-ups below.
+        """
+        # tolerate these keys being present but null, not just missing
+        ext_ids = raw.get('ext_ids') or {}
+        fatcat = raw.get('fatcat') or {}
+        request_source = raw.get('ingest_request_source') or ''
+
+        # backwards compat hacks; transform request to look like current schema
+        if raw.get('ingest_type') == 'file':
+            raw['ingest_type'] = 'pdf'
+        if (not raw.get('link_source')
+                and raw.get('base_url')
+                and ext_ids.get('doi')
+                and raw['base_url'] == "https://doi.org/{}".format(ext_ids['doi'])):
+            # set link_source(_id) for old ingest requests
+            raw['link_source'] = 'doi'
+            raw['link_source_id'] = ext_ids['doi']
+        if (not raw.get('link_source')
+                and request_source.startswith('savepapernow')
+                and fatcat.get('release_ident')):
+            # set link_source(_id) for old ingest requests
+            raw['link_source'] = 'spn'
+            raw['link_source_id'] = fatcat['release_ident']
+
+        for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
+            if k not in raw:
+                self.counts['skip-request-fields'] += 1
+                return None
+        if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
+            self.counts['skip-ingest-type'] += 1
+            return None
+        request = {
+            'ingest_type': raw['ingest_type'],
+            'base_url': raw['base_url'],
+            'link_source': raw['link_source'],
+            'link_source_id': raw['link_source_id'],
+            'ingest_request_source': raw.get('ingest_request_source'),
+            'request': {},
+        }
+        # extra/optional fields
+        if raw.get('release_stage'):
+            request['release_stage'] = raw['release_stage']
+        if fatcat.get('release_ident'):
+            request['request']['release_ident'] = fatcat['release_ident']
+        for k in ('ext_ids', 'edit_extra', 'rel'):
+            if raw.get(k):
+                request['request'][k] = raw[k]
+        # if this dict is empty, trim it to save DB space
+        if not request['request']:
+            request['request'] = None
+        return request
+
+    def file_result_to_row(self, raw: dict) -> Optional[dict]:
+        """
+        Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema
+
+        if there is a problem with conversion, return None and set skip count
+        """
+        for k in ('request', 'hit', 'status'):
+            if k not in raw:
+                self.counts['skip-result-fields'] += 1
+                return None
+        if 'base_url' not in raw['request']:
+            self.counts['skip-result-fields'] += 1
+            return None
+        ingest_type = raw['request'].get('ingest_type')
+        if ingest_type == 'file':
+            ingest_type = 'pdf'
+        if ingest_type not in ('pdf', 'xml', 'html'):
+            self.counts['skip-ingest-type'] += 1
+            return None
+        if raw['status'] in ("existing", ):
+            self.counts['skip-existing'] += 1
+            return None
+        result = {
+            'ingest_type': ingest_type,
+            'base_url': raw['request']['base_url'],
+            'hit': raw['hit'],
+            'status': raw['status'],
+        }
+        terminal = raw.get('terminal')
+        if terminal:
+            # accept both the current and the older key names
+            result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url')
+            result['terminal_dt'] = terminal.get('terminal_dt')
+            result['terminal_status_code'] = terminal.get('terminal_status_code') or terminal.get('status_code') or terminal.get('http_code')
+            if result['terminal_status_code']:
+                result['terminal_status_code'] = int(result['terminal_status_code'])
+            result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+            # terminal_url may be missing (None); only measure it when present
+            if result['terminal_url'] and len(result['terminal_url']) > 2048:
+                # postgresql13 doesn't like extremely large URLs in b-tree index
+                self.counts['skip-huge-url'] += 1
+                return None
+        return result
+
+    def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
+        """
+        Builds an HtmlMetaRow from an ingest result; returns None unless the
+        record has both `file_meta` and `html_body`.
+        """
+        html_body = record.get('html_body')
+        file_meta = record.get('file_meta')
+        if not (file_meta and html_body):
+            return None
+        return HtmlMetaRow(
+            sha1hex=file_meta["sha1hex"],
+            status=record.get('status'),
+            scope=record.get('scope'),
+            has_teixml=bool(html_body and html_body['status'] == 'success'),
+            has_thumbnail=False,  # TODO
+            word_count=(html_body and html_body.get('word_count')) or None,
+            biblio=record.get('html_biblio'),
+            resources=record.get('html_resources'),
+        )
+
+    def push_batch(self, batch):
+        """
+        Persists one batch and commits it. Always returns an empty list.
+        """
+        self.counts['total'] += len(batch)
+
+        if not batch:
+            return []
+
+        results = [self.file_result_to_row(raw) for raw in batch]
+        results = [r for r in results if r]
+
+        request_rows = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
+        request_rows = [r for r in request_rows if r]
+
+        if request_rows:
+            resp = self.db.insert_ingest_request(self.cur, request_rows)
+            self.counts['insert-requests'] += resp[0]
+            self.counts['update-requests'] += resp[1]
+        if results:
+            resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
+            self.counts['insert-results'] += resp[0]
+            self.counts['update-results'] += resp[1]
+
+        # these schemas match, so can just pass through
+        cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')]
+        revisit_cdx_batch = [r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx')]
+        cdx_batch.extend(revisit_cdx_batch)
+        # filter to full CDX lines, with full warc_paths (not liveweb)
+        cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])]
+        if cdx_batch:
+            resp = self.db.insert_cdx(self.cur, cdx_batch)
+            self.counts['insert-cdx'] += resp[0]
+            self.counts['update-cdx'] += resp[1]
+
+        file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
+        if file_meta_batch:
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
+            self.counts['insert-file_meta'] += resp[0]
+            self.counts['update-file_meta'] += resp[1]
+
+        html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')]
+        # result_to_html_meta() returns None when file_meta is missing; drop
+        # those so that only real rows reach the database layer
+        html_meta_batch = [row for row in html_meta_batch if row is not None]
+        if html_meta_batch:
+            resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="update")
+            self.counts['insert-html_meta'] += resp[0]
+            self.counts['update-html_meta'] += resp[1]
+
+        self.db.commit()
+        return []
+
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+    """
+    Persists batches of bare ingest requests (no results), reusing the
+    parent's request_to_row() conversion.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__(db_url=db_url)
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def push_batch(self, batch):
+        self.counts['total'] += len(batch)
+        if not batch:
+            return []
+
+        # convert every request, keeping only those that converted cleanly
+        rows = []
+        for raw in batch:
+            row = self.request_to_row(raw)
+            if row:
+                rows.append(row)
+
+        if rows:
+            resp = self.db.insert_ingest_request(self.cur, rows)
+            self.counts['insert-requests'] += resp[0]
+            self.counts['update-requests'] += resp[1]
+
+        self.db.commit()
+        return []
+
+class PersistGrobidWorker(SandcrawlerWorker):
+    """
+    Persists GROBID results: TEI-XML goes to the blob store (S3/minio), and
+    status plus extracted metadata go to the SQL database.
+
+    The `s3_only` and `db_only` keyword arguments restrict persistence to one
+    of the two destinations; they are mutually exclusive.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__()
+        self.grobid = GrobidClient()
+        self.s3 = SandcrawlerMinioClient(
+            host_url=kwargs.get('s3_url', 'localhost:9000'),
+            access_key=kwargs['s3_access_key'],
+            secret_key=kwargs['s3_secret_key'],
+            default_bucket=kwargs['s3_bucket'],
+        )
+        self.s3_only = kwargs.get('s3_only', False)
+        self.db_only = kwargs.get('db_only', False)
+        assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+        # no database connection is opened in s3-only mode
+        self.db = None
+        self.cur = None
+        if not self.s3_only:
+            self.db = SandcrawlerPostgresClient(db_url)
+            self.cur = self.db.conn.cursor()
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def _store_and_enrich(self, row):
+        """
+        Handles a single record in place: uploads its TEI-XML (unless in
+        db-only mode) and attaches the metadata extracted from it.
+        """
+        if row['status_code'] != 200 or not row.get('tei_xml'):
+            self.counts['s3-skip-status'] += 1
+            if row.get('error_msg'):
+                row['metadata'] = {'error_msg': row['error_msg'][:500]}
+            return
+
+        assert len(row['key']) == 40
+        if not self.db_only:
+            self.s3.put_blob(
+                folder="grobid",
+                blob=row['tei_xml'],
+                sha1hex=row['key'],
+                extension=".tei.xml",
+            )
+            self.counts['s3-put'] += 1
+
+        # enhance with teixml2json metadata, if available
+        try:
+            metadata = self.grobid.metadata(row)
+        except xml.etree.ElementTree.ParseError as xml_e:
+            row['status'] = 'bad-grobid-xml'
+            row['metadata'] = {'error_msg': str(xml_e)[:1024]}
+            return
+        if not metadata:
+            return
+
+        # these two fields are stored on the row itself, not inside metadata
+        for field in ('fatcat_release', 'grobid_version'):
+            row[field] = metadata.pop(field, None)
+        if row.get('fatcat_release'):
+            row['fatcat_release'] = row['fatcat_release'].replace('release_', '')
+        if metadata.get('grobid_timestamp'):
+            row['updated'] = metadata['grobid_timestamp']
+        row['metadata'] = metadata
+
+    def push_batch(self, batch):
+        self.counts['total'] += len(batch)
+
+        # filter out bad "missing status_code" timeout rows
+        with_status = [row for row in batch if row.get('status_code')]
+        num_missing = len(batch) - len(with_status)
+        if num_missing:
+            self.counts['skip-missing-status'] += num_missing
+            batch = with_status
+
+        for row in batch:
+            self._store_and_enrich(row)
+
+        if self.s3_only:
+            return []
+
+        resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
+        self.counts['insert-grobid'] += resp[0]
+        self.counts['update-grobid'] += resp[1]
+
+        file_meta_batch = [row['file_meta'] for row in batch if row.get('file_meta')]
+        resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
+        self.counts['insert-file-meta'] += resp[0]
+        self.counts['update-file-meta'] += resp[1]
+
+        self.db.commit()
+        return []
+
+
+class PersistGrobidDiskWorker(SandcrawlerWorker):
+    """
+    Writes blobs out to disk.
+
+    This could be refactored into a "Sink" type with an even thinner wrapper.
+    """
+
+    def __init__(self, output_dir):
+        super().__init__()
+        self.output_dir = output_dir
+
+    def _blob_path(self, sha1hex, extension=".tei.xml"):
+        """
+        Relative path for a blob: two levels of 2-character sha1hex prefix
+        directories, then the full sha1hex plus extension.
+        """
+        obj_path = "{}/{}/{}{}".format(
+            sha1hex[0:2],
+            sha1hex[2:4],
+            sha1hex,
+            extension,
+        )
+        return obj_path
+
+    def process(self, record, key=None):
+        """
+        Writes the record's `tei_xml` under output_dir. Returns False when the
+        record has no usable TEI-XML; otherwise returns the record with its
+        `tei_xml` field removed.
+        """
+        if record.get('status_code') != 200 or not record.get('tei_xml'):
+            return False
+        assert len(record['key']) == 40
+        p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
+        os.makedirs(os.path.dirname(p), exist_ok=True)
+        # write explicitly as UTF-8 so that output doesn't depend on the
+        # locale's default encoding
+        with open(p, 'w', encoding='utf-8') as f:
+            f.write(record.pop('tei_xml'))
+        self.counts['written'] += 1
+        return record
+
+
+class PersistPdfTrioWorker(SandcrawlerWorker):
+    """
+    Persists batches of pdftrio classification results (and file metadata for
+    successful ones) to the SQL database, one transaction per batch.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__()
+        self.db = SandcrawlerPostgresClient(db_url)
+        self.cur = self.db.conn.cursor()
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def push_batch(self, batch):
+        self.counts['total'] += len(batch)
+
+        # only records carrying a pdf_trio result with a status_code are kept
+        kept = []
+        for record in batch:
+            if 'pdf_trio' in record and record['pdf_trio'].get('status_code'):
+                kept.append(record)
+
+        pdftrio_rows = []
+        for record in kept:
+            trio = record['pdf_trio']
+            # copy key (sha1hex) into sub-object
+            trio['key'] = record['key']
+            pdftrio_rows.append(trio)
+        resp = self.db.insert_pdftrio(self.cur, pdftrio_rows, on_conflict="update")
+        self.counts['insert-pdftrio'] += resp[0]
+        self.counts['update-pdftrio'] += resp[1]
+
+        # file metadata is only persisted for successful classifications
+        file_meta_rows = []
+        for record in kept:
+            if record['pdf_trio']['status'] == "success" and record.get('file_meta'):
+                file_meta_rows.append(record['file_meta'])
+        resp = self.db.insert_file_meta(self.cur, file_meta_rows)
+        self.counts['insert-file-meta'] += resp[0]
+        self.counts['update-file-meta'] += resp[1]
+
+        self.db.commit()
+        return []
+
+
+class PersistPdfTextWorker(SandcrawlerWorker):
+    """
+    Sends extracted PDF text to the blob store (S3/seaweed/minio), and PDF
+    metadata to the SQL database.
+
+    Batch sizes should be kept small.
+    """
+
+    def __init__(self, db_url, **kwargs):
+        super().__init__()
+        self.s3 = SandcrawlerMinioClient(
+            host_url=kwargs.get('s3_url', 'localhost:9000'),
+            access_key=kwargs['s3_access_key'],
+            secret_key=kwargs['s3_secret_key'],
+            default_bucket=kwargs['s3_bucket'],
+        )
+        self.s3_only = kwargs.get('s3_only', False)
+        self.db_only = kwargs.get('db_only', False)
+        assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+        # no database connection is opened in s3-only mode
+        self.db = None
+        self.cur = None
+        if not self.s3_only:
+            self.db = SandcrawlerPostgresClient(db_url)
+            self.cur = self.db.conn.cursor()
+
+    def process(self, record, key=None):
+        """
+        Only do batches (as transactions)
+        """
+        raise NotImplementedError
+
+    def push_batch(self, batch):
+        self.counts['total'] += len(batch)
+
+        parsed_batch = [PdfExtractResult.from_pdftext_dict(raw) for raw in batch]
+
+        for extracted in parsed_batch:
+            has_text = extracted.status == 'success' and extracted.text
+            if not has_text:
+                self.counts['s3-skip-status'] += 1
+                if extracted.error_msg:
+                    extracted.metadata = {'error_msg': extracted.error_msg[:500]}
+                continue
+
+            assert len(extracted.sha1hex) == 40
+            if self.db_only:
+                continue
+            self.s3.put_blob(
+                folder="text",
+                blob=extracted.text,
+                sha1hex=extracted.sha1hex,
+                extension=".txt",
+            )
+            self.counts['s3-put'] += 1
+
+        if self.s3_only:
+            return []
+
+        resp = self.db.insert_pdf_meta(self.cur, parsed_batch, on_conflict="update")
+        self.counts['insert-pdf-meta'] += resp[0]
+        self.counts['update-pdf-meta'] += resp[1]
+
+        file_meta_batch = [extracted.file_meta for extracted in parsed_batch if extracted.file_meta]
+        resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
+        self.counts['insert-file-meta'] += resp[0]
+        self.counts['update-file-meta'] += resp[1]
+
+        self.db.commit()
+        return []
+
+
+class PersistThumbnailWorker(SandcrawlerWorker):
+    """
+    Pushes thumbnail images to blob store (S3/seaweed/minio). Does not talk
+    to the sandcrawler database (SQL).
+
+    This worker *must* be used with raw kafka mode; thumbnails are *not*
+    wrapped in JSON like most sandcrawler kafka messages.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.s3 = SandcrawlerMinioClient(
+            host_url=kwargs.get('s3_url', 'localhost:9000'),
+            access_key=kwargs['s3_access_key'],
+            secret_key=kwargs['s3_secret_key'],
+            default_bucket=kwargs['s3_bucket'],
+        )
+        self.s3_extension = kwargs.get('s3_extension', ".jpg")
+        self.s3_folder = kwargs.get('s3_folder', "pdf")
+
+    def process(self, blob: bytes, key: Optional[str] = None):
+        """
+        Processing raw messages, not decoded JSON objects
+
+        `key` is the 40-character sha1hex of the source file (str, or bytes
+        which get decoded as UTF-8); `blob` is the raw image bytes. Returns
+        nothing.
+        """
+
+        if isinstance(key, bytes):
+            key = key.decode('utf-8')
+        # check the type before taking len(), so that a key of the wrong type
+        # fails this assertion instead of raising TypeError from len()
+        assert key is not None and isinstance(key, str) and len(key) == 40
+        assert isinstance(blob, bytes)
+        # very small blobs are not plausible images
+        assert len(blob) >= 50
+
+        self.s3.put_blob(
+            folder=self.s3_folder,
+            blob=blob,
+            sha1hex=key,
+            extension=self.s3_extension,
+        )
+        self.counts['s3-put'] += 1
+
+
+class GenericPersistDocWorker(SandcrawlerWorker):
+    """
+    Pushes blobs from Kafka to S3.
+
+    Objects are assumed to be JSON-wrapped strings.
+
+    Sub-classes are expected to override `s3_extension`, `s3_folder` and
+    `doc_key` (the record field which holds the document string).
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.s3 = SandcrawlerMinioClient(
+            host_url=kwargs.get('s3_url', 'localhost:9000'),
+            access_key=kwargs['s3_access_key'],
+            secret_key=kwargs['s3_secret_key'],
+            default_bucket=kwargs['s3_bucket'],
+        )
+        self.s3_extension = kwargs.get('s3_extension', ".unknown")
+        self.s3_folder = kwargs.get('s3_folder', "unknown")
+        self.doc_key = "unknown"
+
+    def process(self, record: dict, key: Optional[AnyStr] = None) -> None:
+        """
+        Uploads `record[self.doc_key]` (UTF-8 encoded) under the given key.
+        Records which are not successful, or which lack the document field,
+        are silently ignored.
+
+        Raises TypeError if `key` is neither str nor bytes.
+        """
+
+        if record.get('status') != 'success' or not record.get(self.doc_key):
+            return
+
+        assert key is not None
+        if isinstance(key, bytes):
+            key_str = key.decode('utf-8')
+        elif isinstance(key, str):
+            key_str = key
+        else:
+            # previously fell through and failed with UnboundLocalError below
+            raise TypeError("key must be str or bytes, not {}".format(type(key).__name__))
+        assert len(key_str) == 40
+        if 'sha1hex' in record:
+            assert key_str == record['sha1hex']
+
+        self.s3.put_blob(
+            folder=self.s3_folder,
+            blob=record[self.doc_key].encode('utf-8'),
+            sha1hex=key_str,
+            extension=self.s3_extension,
+        )
+        self.counts['s3-put'] += 1
+
+
+class PersistXmlDocWorker(GenericPersistDocWorker):
+    """
+    Uploads XML documents (the `jats_xml` field of each record) to the blob
+    store (S3/seaweed/minio). No interaction with the sandcrawler database
+    (SQL).
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # replace the parent's "unknown" placeholders
+        self.doc_key = "jats_xml"
+        self.s3_folder = kwargs.get('s3_folder', "xml_doc")
+        self.s3_extension = kwargs.get('s3_extension', ".jats.xml")
+
+
+class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
+    """
+    Uploads TEI-XML documents (the `tei_xml` field of each record) to the
+    blob store (S3/seaweed/minio). No interaction with the sandcrawler
+    database (SQL).
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # replace the parent's "unknown" placeholders
+        self.doc_key = "tei_xml"
+        self.s3_folder = kwargs.get('s3_folder', "html_body")
+        self.s3_extension = kwargs.get('s3_extension', ".tei.xml")
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
new file mode 100644
index 0000000..37e3d7a
--- /dev/null
+++ b/python/sandcrawler/workers.py
@@ -0,0 +1,625 @@
+
+import sys
+import json
+import time
+import signal
+import zipfile
+import requests
+import multiprocessing.pool
+from collections import Counter
+from confluent_kafka import Consumer, Producer, KafkaException
+
+from .misc import parse_cdx_line
+from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
+
+
+class SandcrawlerWorker(object):
+    """
+    Base class for sandcrawler workers.
+
+    Usually these get "pushed" into by a RecordPusher. Output goes to another
+    worker (pipeline-style), or defaults to stdout.
+    """
+
+    def __init__(self):
+        self.counts = Counter()
+        self.sink = None
+        # TODO: self.counters
+
+    def push_record(self, task, key=None):
+        """
+        Runs a single task through want() and process(), updates counters,
+        and forwards the result to the sink (or prints it as JSON to stdout
+        when there is no sink).
+
+        Returns the result, or None if the task was skipped or processing
+        returned something falsy (which is counted as 'failed').
+        """
+        self.counts['total'] += 1
+        if not self.want(task):
+            self.counts['skip'] += 1
+            return
+        result = self.process(task, key=key)
+        if not result:
+            self.counts['failed'] += 1
+            return
+        # isinstance() so that dict subclasses are counted by status as well;
+        # overly long status strings are not used as counter keys
+        elif isinstance(result, dict) and 'status' in result and len(result['status']) < 32:
+            self.counts[result['status']] += 1
+
+        if self.sink:
+            self.sink.push_record(result)
+            self.counts['pushed'] += 1
+        else:
+            print(json.dumps(result))
+        return result
+
+    def timeout_response(self, task):
+        """
+        This should be overridden by workers that want to return something
+        meaningful when there is a processing timeout. Eg, JSON vs some other
+        error message.
+        """
+        return None
+
+    def push_record_timeout(self, task, key=None, timeout=300):
+        """
+        A wrapper around self.push_record which sets a timeout.
+
+        Note that this uses signals and *will behave wrong/weirdly* with
+        multithreading or if signal-based timeouts are used elsewhere in the
+        same process.
+
+        On timeout, the result of self.timeout_response(task) is pushed to the
+        sink (or printed) and returned instead.
+        """
+
+        def timeout_handler(signum, frame):
+            raise TimeoutError("timeout processing record")
+        signal.signal(signal.SIGALRM, timeout_handler)
+        resp = None
+        signal.alarm(int(timeout))
+        try:
+            resp = self.push_record(task, key=key)
+        except TimeoutError:
+            self.counts['timeout'] += 1
+            resp = self.timeout_response(task)  # pylint: disable=assignment-from-none
+            # TODO: what if it is this push_record() itself that is timing out?
+            if resp and self.sink:
+                self.sink.push_record(resp)
+                self.counts['pushed'] += 1
+            elif resp:
+                print(json.dumps(resp))
+        finally:
+            # always cancel any pending alarm
+            signal.alarm(0)
+        return resp
+
+    def push_batch(self, tasks):
+        """
+        Pushes each task individually; returns the list of per-task results.
+        """
+        results = []
+        for task in tasks:
+            results.append(self.push_record(task))
+        return results
+
+    def finish(self):
+        """
+        Finishes the sink (if any), prints counters to stderr, and returns
+        them.
+        """
+        if self.sink:
+            self.sink.finish()
+        print("Worker: {}".format(self.counts), file=sys.stderr)
+        return self.counts
+
+    def want(self, task):
+        """
+        Optionally override this as a filter in implementations.
+        """
+        return True
+
+    def process(self, task, key=None):
+        """
+        Derived workers need to implement business logic here.
+        """
+        raise NotImplementedError('implementation required')
+
+
+class SandcrawlerFetchWorker(SandcrawlerWorker):
+    """
+    Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg,
+    PDFs) from wayback, archive.org, or other sources.
+    """
+
+    def __init__(self, wayback_client, **kwargs):
+        super().__init__(**kwargs)
+        self.wayback_client = wayback_client
+
+    def fetch_blob(self, record):
+        """
+        Fetches the body described by `record`, which must contain 'sha1hex'
+        and one of these field combinations:
+
+        - warc_path + warc_offset (+ warc_csize): full CDX dict, fetched from
+          petabox via the wayback client
+        - url + datetime: fetched via wayback replay
+        - item + path: fetched over HTTP from archive.org
+
+        Returns a dict with 'key', 'source' and 'status'; on success, status
+        is "success" and 'blob' holds the body. Fetch failures are returned
+        with an error status ("error-wayback", "error-petabox", "empty-blob")
+        and usually an 'error_msg'.
+
+        Raises ValueError if the record matches none of the above, and
+        Exception if a wayback fetch is needed but no client is configured.
+        """
+        default_key = record['sha1hex']
+
+        if record.get('warc_path') and record.get('warc_offset'):
+            # it's a full CDX dict. fetch using WaybackClient
+            if not self.wayback_client:
+                raise Exception("wayback client not configured for this {}".format(type(self).__name__))
+            try:
+                blob = self.wayback_client.fetch_petabox_body(
+                    csize=record['warc_csize'],
+                    offset=record['warc_offset'],
+                    warc_path=record['warc_path'],
+                )
+            # KeyError covers a record with no 'warc_csize'
+            except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
+                return dict(
+                    key=default_key,
+                    source=record,
+                    status="error-wayback",
+                    error_msg=str(we),
+                )
+        elif record.get('url') and record.get('datetime'):
+            # it's a partial CDX dict or something? fetch using WaybackClient
+            if not self.wayback_client:
+                raise Exception("wayback client not configured for this {}".format(type(self).__name__))
+            try:
+                blob = self.wayback_client.fetch_replay_body(
+                    url=record['url'],
+                    datetime=record['datetime'],
+                )
+            except (WaybackError, WaybackContentError) as we:
+                return dict(
+                    key=default_key,
+                    source=record,
+                    status="error-wayback",
+                    error_msg=str(we),
+                )
+        elif record.get('item') and record.get('path'):
+            # it's petabox link; fetch via HTTP
+            # NOTE(review): no timeout is passed here, and connection-level
+            # errors from this call propagate to the caller
+            resp = requests.get("https://archive.org/serve/{}/{}".format(
+                record['item'], record['path']))
+            try:
+                resp.raise_for_status()
+            except Exception as e:
+                return dict(
+                    key=default_key,
+                    source=record,
+                    status="error-petabox",
+                    error_msg=str(e),
+                )
+            blob = resp.content
+        else:
+            raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
+        if not blob:
+            return dict(
+                key=default_key,
+                source=record,
+                status="empty-blob",
+            )
+        return dict(
+            key=default_key,
+            status="success",
+            source=record,
+            blob=blob,
+        )
+
+class MultiprocessWrapper(SandcrawlerWorker):
+    """
+    Runs `worker.process` over batches of tasks in a multiprocessing pool
+    and forwards each result to `sink`, or prints it to stdout as JSON when
+    no sink is configured.
+    """
+
+    def __init__(self, worker, sink, jobs=None):
+        self.counts = Counter()
+        self.worker = worker
+        self.sink = sink
+        # `jobs` is passed straight through as the pool's process count
+        self.pool = multiprocessing.pool.Pool(jobs)
+
+    def push_batch(self, tasks):
+        """
+        Process `tasks` in parallel and forward every non-empty result.
+
+        Falsy results are counted under 'failed' and skipped. Returns the
+        full list of results (including falsy ones), in task order.
+        """
+        self.counts['total'] += len(tasks)
+        print("... processing batch of: {}".format(len(tasks)), file=sys.stderr)
+        results = self.pool.map(self.worker.process, tasks)
+        for result in results:
+            if not result:
+                self.counts['failed'] += 1
+                # skip only this result; the remaining results of the batch
+                # still have to be counted and forwarded
+                continue
+            elif type(result) == dict and 'status' in result and len(result['status']) < 32:
+                # only short status strings are used as counter keys
+                self.counts[result['status']] += 1
+
+            if self.sink:
+                self.sink.push_record(result)
+                self.counts['pushed'] += 1
+            else:
+                print(json.dumps(result))
+        return results
+
+    def finish(self):
+        """
+        Shut down the pool, finish the sink (if any) and the wrapped worker.
+
+        Returns whatever the wrapped worker's `finish()` returns.
+        """
+        self.pool.terminate()
+        if self.sink:
+            self.sink.finish()
+        worker_counts = self.worker.finish()
+        print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
+        return worker_counts
+
+class BlackholeSink(SandcrawlerWorker):
+    """
+    No-op sink: every record or batch pushed into it is discarded.
+
+    Handy as a stand-in sink in tests.
+    """
+
+    def push_record(self, task, key=None):
+        # intentionally drop the record
+        return None
+
+    def push_batch(self, tasks):
+        # intentionally drop the whole batch
+        return None
+
+class KafkaSink(SandcrawlerWorker):
+    """
+    Sink that publishes every pushed record as a message on one Kafka topic.
+    """
+
+    def __init__(self, kafka_hosts, produce_topic, **kwargs):
+        self.sink = None
+        self.counts = Counter()
+        self.produce_topic = produce_topic
+        self.kafka_hosts = kafka_hosts
+
+        base_config = {
+            'bootstrap.servers': kafka_hosts,
+            'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes
+            'api.version.request': True,
+            'api.version.fallback.ms': 0,
+        }
+        # subclasses customize the producer by overriding producer_config()
+        self.producer = Producer(self.producer_config(base_config))
+
+
+    @staticmethod
+    def _fail_fast(err, msg):
+        """Delivery callback: any reported error aborts by raising KafkaException."""
+        if err is None:
+            return
+        print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+        print("Bailing out...", file=sys.stderr)
+        # TODO: should it be sys.exit(-1)?
+        raise KafkaException(err)
+
+    def producer_config(self, kafka_config):
+        """Return a copy of `kafka_config` extended with delivery/ack settings."""
+        merged = kafka_config.copy()
+        merged['delivery.report.only.error'] = True
+        merged['default.topic.config'] = {
+            'message.timeout.ms': 30000,
+            'request.required.acks': -1, # all brokers must confirm
+        }
+        return merged
+
+    def push_record(self, msg, key=None):
+        """
+        Produce one message. A dict is serialized to JSON (and its 'key'
+        entry is used as the message key when no `key` is given); a str is
+        encoded as UTF-8; bytes are sent as-is.
+        """
+        self.counts['total'] += 1
+        payload = msg
+        if type(payload) == dict:
+            if not key and 'key' in payload:
+                key = payload['key']
+            payload = json.dumps(payload)
+        if type(payload) == str:
+            payload = payload.encode('utf-8')
+        assert type(payload) == bytes
+
+        self.producer.produce(
+            self.produce_topic,
+            payload,
+            key=key,
+            on_delivery=self._fail_fast)
+        self.counts['produced'] += 1
+
+        # check for errors etc
+        self.producer.poll(0)
+
+    def push_batch(self, msgs):
+        """Produce each message of `msgs` in order via push_record()."""
+        for single in msgs:
+            self.push_record(single)
+
+    def finish(self):
+        """Flush the producer and return this sink's counters."""
+        self.producer.flush()
+        return self.counts
+
+
+class KafkaCompressSink(KafkaSink):
+    """
+    KafkaSink flavor intended for large documents (eg, GROBID output):
+    messages are gzip-compressed and sent in small, lingering batches.
+    """
+
+    def producer_config(self, kafka_config):
+        """Return a copy of `kafka_config` with compression, batching and ack settings added."""
+        merged = kafka_config.copy()
+        overrides = {
+            'compression.codec': 'gzip',
+            'retry.backoff.ms': 250,
+            'linger.ms': 1000,
+            'batch.num.messages': 50,
+            'delivery.report.only.error': True,
+            'default.topic.config': {
+                'message.timeout.ms': 30000,
+                'request.required.acks': -1, # all brokers must confirm
+            },
+        }
+        for option, value in overrides.items():
+            merged[option] = value
+        return merged
+
+
+class RecordPusher:
+    """
+    Common base for record sources that feed a worker. The interface is
+    minimal: hold on to the worker and, in run(), push records into it.
+    """
+
+    def __init__(self, worker, **kwargs):
+        self.worker = worker
+        self.counts = Counter()
+
+    def run(self):
+        """
+        Subclasses implement this; a typical body looks like:
+
+            for line in sys.stdin:
+                record = json.loads(line)
+                self.worker.push_record(record)
+            print(self.worker.finish())
+        """
+        raise NotImplementedError
+
+
+class JsonLinePusher(RecordPusher):
+    """
+    Reads JSON objects, one per line, from `json_file` and pushes them into
+    `worker`, individually or in batches of `batch_size` (keyword argument;
+    values of None, 0 or 1 mean "push one record at a time").
+    """
+
+    def __init__(self, worker, json_file, **kwargs):
+        self.counts = Counter()
+        self.worker = worker
+        self.json_file = json_file
+        self.batch_size = kwargs.get('batch_size', None)
+        if self.batch_size in (0, 1):
+            self.batch_size = None
+
+    def run(self):
+        """
+        Push every parseable line, then finish the worker.
+
+        Blank lines are skipped without being counted; lines that fail to
+        parse are counted under 'error-json-decode'. Returns this pusher's
+        counters.
+        """
+        batch = []
+        for line in self.json_file:
+            # lines read from a file keep their trailing newline, so a
+            # blank line is "\n", not ""; strip before testing for empty
+            if not line.strip():
+                continue
+            self.counts['total'] += 1
+            try:
+                record = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                self.counts['error-json-decode'] += 1
+                continue
+            if self.batch_size:
+                batch.append(record)
+                if len(batch) >= self.batch_size:
+                    self.worker.push_batch(batch)
+                    self.counts['pushed'] += len(batch)
+                    batch = []
+            else:
+                self.worker.push_record(record)
+                self.counts['pushed'] += 1
+        # flush the final, partially filled batch
+        if self.batch_size and batch:
+            self.worker.push_batch(batch)
+            self.counts['pushed'] += len(batch)
+            batch = []
+        self.worker.finish()
+        print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
+        return self.counts
+
+
+class CdxLinePusher(RecordPusher):
+    """
+    Parses CDX lines from `cdx_file`, optionally filters them by HTTP status
+    and mimetype, and pushes the surviving records into `worker`.
+    """
+
+    def __init__(self, worker, cdx_file, **kwargs):
+        self.counts = Counter()
+        self.worker = worker
+        self.cdx_file = cdx_file
+        self.filter_http_statuses = kwargs.get('filter_http_statuses', None)
+        self.filter_mimetypes = kwargs.get('filter_mimetypes', None)
+        # NOTE(review): stored here but not consulted by run() below
+        self.allow_octet_stream = kwargs.get('allow_octet_stream', False)
+        batch_size = kwargs.get('batch_size', None)
+        # 0 and 1 both mean "no batching"
+        self.batch_size = None if batch_size in (0, 1) else batch_size
+
+    def run(self):
+        """
+        Push every accepted CDX record, then finish the worker.
+
+        Skipped lines are tallied under 'skip-parse', 'skip-http_status' or
+        'skip-mimetype'. Returns this pusher's counters.
+        """
+        pending = []
+        for raw_line in self.cdx_file:
+            if not raw_line:
+                continue
+            self.counts['total'] += 1
+            cdx = parse_cdx_line(raw_line, normalize=True)
+            if not cdx:
+                self.counts['skip-parse'] += 1
+                continue
+            if self.filter_http_statuses and cdx['http_status'] not in self.filter_http_statuses:
+                self.counts['skip-http_status'] += 1
+                continue
+            if self.filter_mimetypes and cdx['mimetype'] not in self.filter_mimetypes:
+                self.counts['skip-mimetype'] += 1
+                continue
+            if not self.batch_size:
+                # unbatched mode: hand over each record immediately
+                self.worker.push_record(cdx)
+                self.counts['pushed'] += 1
+                continue
+            pending.append(cdx)
+            if len(pending) >= self.batch_size:
+                self.worker.push_batch(pending)
+                self.counts['pushed'] += len(pending)
+                pending = []
+        # flush whatever is left of the last batch
+        if self.batch_size and pending:
+            self.worker.push_batch(pending)
+            self.counts['pushed'] += len(pending)
+            pending = []
+        self.worker.finish()
+        print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
+        return self.counts
+
+
+class ZipfilePusher(RecordPusher):
+    """
+    Reads every member of a zip archive whose name ends in ".pdf" and pushes
+    its raw bytes into `worker`, individually or in batches of `batch_size`
+    (keyword argument; None, 0 or 1 mean "push one at a time").
+    """
+
+    def __init__(self, worker, zipfile_path, **kwargs):
+        self.counts = Counter()
+        self.worker = worker
+        self.filter_suffix = ".pdf"
+        self.zipfile_path = zipfile_path
+        self.batch_size = kwargs.get('batch_size', None)
+        if self.batch_size in (0, 1):
+            self.batch_size = None
+
+    def run(self):
+        """Push all matching archive members, finish the worker, return this pusher's counters."""
+        batch = []
+        with zipfile.ZipFile(self.zipfile_path, 'r') as archive:
+            for zipinfo in archive.infolist():
+                if not zipinfo.filename.endswith(self.filter_suffix):
+                    continue
+                self.counts['total'] += 1
+                # NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
+                # context manager so the member stream is closed even if read() raises
+                with archive.open(zipinfo, 'r') as flo:
+                    data = flo.read(2**32)
+                if self.batch_size:
+                    batch.append(data)
+                    if len(batch) >= self.batch_size:
+                        self.worker.push_batch(batch)
+                        self.counts['pushed'] += len(batch)
+                        batch = []
+                else:
+                    self.worker.push_record(data)
+                    self.counts['pushed'] += 1
+        # flush the final, partially filled batch
+        if self.batch_size and batch:
+            self.worker.push_batch(batch)
+            self.counts['pushed'] += len(batch)
+            batch = []
+        self.worker.finish()
+        print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
+        return self.counts
+
+class KafkaJsonPusher(RecordPusher):
+    """
+    Consumes messages from a Kafka topic and pushes them into `worker`,
+    either as whole batches (`push_batches=True`) or one record at a time.
+
+    Keyword arguments read by the constructor (defaults in parentheses):
+    `push_batches` (False), `raw_records` (False; hand message bytes to the
+    worker without JSON decoding), `poll_interval` (5.0, seconds),
+    `batch_size` (100), `batch_worker` (False; stored but not used by this
+    class) and `process_timeout_sec` (300; timeout passed to the worker for
+    each record in one-at-a-time mode).
+    """
+
+    def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs):
+        self.counts = Counter()
+        self.worker = worker
+        self.consumer = make_kafka_consumer(
+            kafka_hosts,
+            consume_topic,
+            group,
+        )
+        self.push_batches = kwargs.get('push_batches', False)
+        self.raw_records = kwargs.get('raw_records', False)
+        self.poll_interval = kwargs.get('poll_interval', 5.0)
+        self.batch_size = kwargs.get('batch_size', 100)
+        if self.batch_size in (0, 1):
+            self.batch_size = 1
+        self.batch_worker = kwargs.get('batch_worker', False)
+        self.process_timeout_sec = kwargs.get('process_timeout_sec', 300)
+
+    def run(self):
+        """
+        Consume and push messages in an endless loop.
+
+        Offsets are stored locally only after a consumed batch has been
+        handed to the worker. Raises KafkaException if any consumed message
+        carries an error.
+        """
+        while True:
+            # TODO: this is batch-oriented, because underlying worker is
+            # often batch-oriented, but this doesn't confirm that entire batch
+            # has been pushed to fatcat before committing offset. Eg, consider
+            # case where there is one update and thousands of creates;
+            # update would be lingering in worker, and if worker crashed
+            # never created. Not great.
+            batch = self.consumer.consume(
+                num_messages=self.batch_size,
+                timeout=self.poll_interval)
+            print("... got {} kafka messages ({}sec poll interval)".format(
+                len(batch), self.poll_interval),
+                file=sys.stderr)
+            if not batch:
+                # TODO: could have some larger timeout here and
+                # self.worker.finish() if it's been more than, eg, a couple
+                # minutes
+                continue
+            # first check errors on entire batch...
+            for msg in batch:
+                if msg.error():
+                    raise KafkaException(msg.error())
+            # ... then process
+            if self.push_batches:
+                self.counts['total'] += len(batch)
+                records = [json.loads(msg.value().decode('utf-8')) for msg in batch]
+                self.worker.push_batch(records)
+                self.counts['pushed'] += len(batch)
+                print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
+            else:
+                for msg in batch:
+                    self.counts['total'] += 1
+                    if self.raw_records:
+                        # In this mode, pass the Kafka message as bytes through
+                        # without decoding as JSON. Eg, for thumbnails (where
+                        # message bytes are JPEG, and we need the sha1hex key
+                        # from the message)
+                        record = msg.value()
+                    else:
+                        record = json.loads(msg.value().decode('utf-8'))
+                    # This complex bit of code implements backoff/backpressure
+                    # in a way that will not cause this Kafka consumer to lose
+                    # partition assignments (resulting in a rebalance). This
+                    # was needed for the ingest workers. There is probably a
+                    # better way to structure this concurrency.
+                    # Retry the same record until the worker accepts it.
+                    while True:
+                        try:
+                            # use timeouts; don't want kafka itself to timeout
+                            self.worker.push_record_timeout(record, key=msg.key(), timeout=self.process_timeout_sec)
+                            break
+                        except SandcrawlerBackoffError as be:
+                            # log to stderr like every other status line;
+                            # stdout may carry JSON results
+                            print("Backing off for 200 seconds: {}".format(be), file=sys.stderr)
+                            self.consumer.pause(self.consumer.assignment())
+                            # 40 iterations x 5 second sleep = 200 seconds
+                            for i in range(40):
+                                # Beware this poll which should not be
+                                # receiving any messages because we are paused!
+                                empty_batch = self.consumer.poll(0)
+                                assert not empty_batch
+                                time.sleep(5)
+                            self.consumer.resume(self.consumer.assignment())
+                    self.counts['pushed'] += 1
+                    if self.counts['total'] % 500 == 0:
+                        print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
+            for msg in batch:
+                # locally store offsets of processed messages; will be
+                # auto-committed by librdkafka from this "stored" value
+                self.consumer.store_offsets(message=msg)
+
+        # NOTE: not reached as written; the loop above only ends by raising.
+        # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
+        # commit the current batch if it has been lingering
+        worker_counts = self.worker.finish()
+        print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
+        self.consumer.close()
+        return self.counts
+
+
+def make_kafka_consumer(hosts, consume_topic, group):
+    """
+    Build a Consumer for `group` on the brokers in `hosts`, subscribe it to
+    `consume_topic` and return it.
+
+    Offsets are not stored automatically (callers must store them once a
+    message is processed), but stored offsets are auto-committed. Commit
+    and rebalance errors raise KafkaException from the callbacks.
+    """
+
+    def _bail(kafka_error):
+        # shared reporting path for commit failures
+        print("Kafka consumer commit error: {}".format(kafka_error), file=sys.stderr)
+        print("Bailing out...", file=sys.stderr)
+        # TODO: should it be sys.exit(-1)?
+        raise KafkaException(kafka_error)
+
+    def fail_fast(err, partitions):
+        if err is not None:
+            _bail(err)
+        for part in partitions:
+            # check for partition-specific commit errors
+            if part.error:
+                _bail(part.error)
+        #print("Kafka consumer commit successful")
+
+    def on_rebalance(consumer, partitions):
+        for part in partitions:
+            if part.error:
+                raise KafkaException(part.error)
+        print("Kafka partitions rebalanced: {} / {}".format(
+            consumer, partitions),
+            file=sys.stderr)
+
+    # previously, using pykafka
+    #auto_commit_enable=True,
+    #auto_commit_interval_ms=30000, # 30 seconds
+    settings = {
+        'bootstrap.servers': hosts,
+        'group.id': group,
+        'on_commit': fail_fast,
+        # messages don't have offset marked as stored until processed,
+        # but we do auto-commit stored offsets to broker
+        'enable.auto.offset.store': False,
+        'enable.auto.commit': True,
+        # user code timeout; if no poll after this long, assume user code
+        # hung and rebalance (default: 6min)
+        'max.poll.interval.ms': 360000,
+        'default.topic.config': {
+            'auto.offset.reset': 'latest',
+        },
+    }
+
+    kafka_consumer = Consumer(settings)
+    # NOTE: it's actually important that the topic name *not* be bytes
+    # (UTF-8 encoded)
+    kafka_consumer.subscribe([consume_topic],
+        on_assign=on_rebalance,
+        on_revoke=on_rebalance,
+    )
+    print("Consuming from kafka topic {}, group {}".format(consume_topic, group), file=sys.stderr)
+    return kafka_consumer
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
new file mode 100644
index 0000000..7a0086d
--- /dev/null
+++ b/python/sandcrawler/xml.py
@@ -0,0 +1,7 @@
+
+import xml.etree.ElementTree as ET
+
+
+def xml_reserialize(raw: bytes) -> str:
+    """
+    Parse `raw` as XML and serialize it back to a string, prefixed with a
+    UTF-8 XML declaration line.
+    """
+    declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
+    document = ET.tostring(ET.fromstring(raw), encoding="unicode")
+    return declaration + document
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
new file mode 100755
index 0000000..8e275cf
--- /dev/null
+++ b/python/sandcrawler_worker.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+
+"""
+These are generally for continuously running workers that consume from Kafka.
+Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
+or S3 (SeaweedFS).
+"""
+
+import os
+import sys
+import argparse
+import datetime
+import raven
+
+from sandcrawler import *
+from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+try:
+    git_sha = raven.fetch_git_sha('..')
+except Exception:
+    # git lookup failed for whatever reason; report without a release tag
+    git_sha = None
+sentry_client = raven.Client(release=git_sha)
+
+
+def run_grobid_extract(args):
+    """
+    Run the GROBID extraction daemon: consume from the `ungrobided-pg`
+    topic, process with a GrobidWorker, publish to `grobid-output-pg`.
+    """
+    source_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
+    output_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+    extract_worker = GrobidWorker(
+        sink=KafkaSink(
+            kafka_hosts=args.kafka_hosts,
+            produce_topic=output_topic,
+        ),
+        grobid_client=GrobidClient(
+            host_url=args.grobid_host,
+        ),
+        # NOTE(review): the wayback client is given the GROBID host URL;
+        # this looks like a copy/paste -- confirm what WaybackClient does
+        # with host_url
+        wayback_client=WaybackClient(
+            host_url=args.grobid_host,
+        ),
+    )
+    KafkaJsonPusher(
+        worker=extract_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=source_topic,
+        group="grobid-extract",
+        batch_size=1,
+    ).run()
+
+def run_pdf_extract(args):
+    """
+    Run the PDF extraction daemon: consume from the `unextracted` topic,
+    process with a PdfExtractWorker, publish text to `pdf-text` and
+    thumbnails to `pdf-thumbnail-180px-jpg`.
+    """
+    consume_topic = "sandcrawler-{}.unextracted".format(args.env)
+    pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+    pdftext_sink = KafkaCompressSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=pdftext_topic,
+    )
+    thumbnail_sink = KafkaSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=thumbnail_topic,
+    )
+    # NOTE(review): the wayback client is given the GROBID host URL; this
+    # looks like a copy/paste -- confirm what WaybackClient does with host_url
+    wayback_client = WaybackClient(
+        host_url=args.grobid_host,
+    )
+    worker = PdfExtractWorker(
+        wayback_client=wayback_client,
+        sink=pdftext_sink,
+        thumbnail_sink=thumbnail_sink,
+    )
+    pusher = KafkaJsonPusher(
+        worker=worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group="pdf-extract",
+        batch_size=1,
+        # KafkaJsonPusher reads `process_timeout_sec`; the previous
+        # `push_timeout_sec` keyword was silently ignored, leaving the
+        # default of 300 seconds in effect
+        process_timeout_sec=120,
+    )
+    pusher.run()
+
+def run_persist_grobid(args):
+    """
+    Run the daemon that consumes the `grobid-output-pg` topic in batches
+    and hands them to a PersistGrobidWorker.
+    """
+    persist_worker = PersistGrobidWorker(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    # s3-only mode consumes under its own consumer group name
+    group_name = "persist-grobid" + ("-s3" if args.s3_only else "")
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic="sandcrawler-{}.grobid-output-pg".format(args.env),
+        group=group_name,
+        push_batches=True,
+        batch_size=25,
+    ).run()
+
+def run_persist_pdftext(args):
+    """
+    Run the daemon that consumes the `pdf-text` topic in batches and hands
+    them to a PersistPdfTextWorker.
+    """
+    persist_worker = PersistPdfTextWorker(
+        db_url=args.db_url,
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_only=args.s3_only,
+        db_only=args.db_only,
+    )
+    # s3-only mode consumes under its own consumer group name
+    group_name = "persist-pdf-text" + ("-s3" if args.s3_only else "")
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic="sandcrawler-{}.pdf-text".format(args.env),
+        group=group_name,
+        push_batches=True,
+        batch_size=25,
+    ).run()
+
+def run_persist_thumbnail(args):
+    """
+    Run the daemon that consumes the `pdf-thumbnail-180px-jpg` topic and
+    hands each raw (undecoded) message to a PersistThumbnailWorker.
+    """
+    persist_worker = PersistThumbnailWorker(
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+        s3_extension=".180px.jpg",
+        s3_folder="pdf",
+    )
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic="sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env),
+        group="persist-pdf-thumbnail",
+        push_batches=False,
+        # message payloads are passed through as bytes, not parsed as JSON
+        raw_records=True,
+        batch_size=25,
+    ).run()
+
+def run_persist_xml_doc(args: argparse.Namespace) -> None:
+    """
+    Run the daemon that consumes the `xml-doc` topic and hands each record
+    to a PersistXmlDocWorker.
+    """
+    source_topic = f"sandcrawler-{args.env}.xml-doc"
+    persist_worker = PersistXmlDocWorker(
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+    )
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=source_topic,
+        group="persist-xml-doc",
+        push_batches=False,
+        batch_size=25,
+    ).run()
+
+def run_persist_html_teixml(args: argparse.Namespace) -> None:
+    """
+    Run the daemon that consumes the `html-teixml` topic and hands each
+    record to a PersistHtmlTeiXmlWorker.
+    """
+    source_topic = f"sandcrawler-{args.env}.html-teixml"
+    persist_worker = PersistHtmlTeiXmlWorker(
+        s3_url=args.s3_url,
+        s3_bucket=args.s3_bucket,
+        s3_access_key=args.s3_access_key,
+        s3_secret_key=args.s3_secret_key,
+    )
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=source_topic,
+        group="persist-html-teixml",
+        push_batches=False,
+        batch_size=25,
+    ).run()
+
+def run_persist_pdftrio(args):
+    """
+    Run the daemon that consumes the `pdftrio-output` topic in batches and
+    hands them to a PersistPdfTrioWorker.
+    """
+    persist_worker = PersistPdfTrioWorker(
+        db_url=args.db_url,
+    )
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic="sandcrawler-{}.pdftrio-output".format(args.env),
+        group="persist-pdftrio",
+        push_batches=True,
+        batch_size=100,
+    ).run()
+
+def run_ingest_file(args):
+    """
+    Run the file ingest daemon.
+
+    `--bulk` and `--priority` select the request topic, the consumer group
+    and the `spn_cdx_retry_sec` value; without either flag the daily topic
+    is consumed. Results and derived outputs go to separate Kafka topics.
+    """
+    # pick queue-specific settings; --bulk wins over --priority
+    if args.bulk:
+        spn_cdx_retry_sec = 9.0
+        group_template = "sandcrawler-{}-ingest-file-bulk"
+        topic_template = "sandcrawler-{}.ingest-file-requests-bulk"
+    elif args.priority:
+        spn_cdx_retry_sec = 45.0
+        group_template = "sandcrawler-{}-ingest-file-priority"
+        topic_template = "sandcrawler-{}.ingest-file-requests-priority"
+    else:
+        spn_cdx_retry_sec = 1.0
+        group_template = "sandcrawler-{}-ingest-file"
+        topic_template = "sandcrawler-{}.ingest-file-requests-daily"
+    consume_group = group_template.format(args.env)
+    consume_topic = topic_template.format(args.env)
+
+    def open_sink(sink_class, topic_format):
+        # one producer per output topic
+        return sink_class(
+            kafka_hosts=args.kafka_hosts,
+            produce_topic=topic_format.format(args.env),
+        )
+
+    result_sink = open_sink(KafkaSink, "sandcrawler-{}.ingest-file-results")
+    grobid_sink = open_sink(KafkaSink, "sandcrawler-{}.grobid-output-pg")
+    grobid_client = GrobidClient(
+        host_url=args.grobid_host,
+    )
+    pdftext_sink = open_sink(KafkaCompressSink, "sandcrawler-{}.pdf-text")
+    thumbnail_sink = open_sink(KafkaSink, "sandcrawler-{}.pdf-thumbnail-180px-jpg")
+    xmldoc_sink = open_sink(KafkaSink, "sandcrawler-{}.xml-doc")
+    htmlteixml_sink = open_sink(KafkaSink, "sandcrawler-{}.html-teixml")
+
+    ingest_worker = IngestFileWorker(
+        grobid_client=grobid_client,
+        sink=result_sink,
+        grobid_sink=grobid_sink,
+        thumbnail_sink=thumbnail_sink,
+        pdftext_sink=pdftext_sink,
+        xmldoc_sink=xmldoc_sink,
+        htmlteixml_sink=htmlteixml_sink,
+        # don't SPNv2 for --bulk backfill
+        try_spn2=not args.bulk,
+        spn_cdx_retry_sec=spn_cdx_retry_sec,
+    )
+    KafkaJsonPusher(
+        worker=ingest_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group=consume_group,
+        batch_size=1,
+    ).run()
+
+def run_persist_ingest_file(args):
+    """
+    Run the daemon that consumes the `ingest-file-results` topic in batches
+    and hands them to a PersistIngestFileResultWorker.
+    """
+    persist_worker = PersistIngestFileResultWorker(
+        db_url=args.db_url,
+    )
+    KafkaJsonPusher(
+        worker=persist_worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic="sandcrawler-{}.ingest-file-results".format(args.env),
+        group="persist-ingest",
+        push_batches=True,
+        batch_size=100,
+    ).run()
+
+def main():
+    """
+    Command-line entry point: parse the shared options plus one
+    sub-command and call that sub-command's `run_*` function.
+
+    Prints help to stderr and exits with status -1 if no sub-command is
+    given.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--kafka-hosts',
+        default="localhost:9092",
+        help="list of Kafka brokers (host/port) to use")
+    parser.add_argument('--env',
+        default="dev",
+        help="Kafka topic namespace to use (eg, prod, qa, dev)")
+    parser.add_argument('--grobid-host',
+        default="http://grobid.qa.fatcat.wiki",
+        help="GROBID API host/port")
+    parser.add_argument('--db-url',
+        help="postgresql database connection string",
+        default="postgres:///sandcrawler")
+    parser.add_argument('--s3-url',
+        help="S3 (seaweedfs) backend URL",
+        default="localhost:9000")
+    parser.add_argument('--s3-access-key',
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
+    parser.add_argument('--s3-secret-key',
+        help="S3 (seaweedfs) credential",
+        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
+    parser.add_argument('--s3-bucket',
+        help="S3 (seaweedfs) bucket to persist into",
+        default="sandcrawler-dev")
+    subparsers = parser.add_subparsers()
+
+    sub_grobid_extract = subparsers.add_parser('grobid-extract',
+        help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka")
+    sub_grobid_extract.set_defaults(func=run_grobid_extract)
+
+    sub_pdf_extract = subparsers.add_parser('pdf-extract',
+        help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka")
+    sub_pdf_extract.set_defaults(func=run_pdf_extract)
+
+    sub_persist_grobid = subparsers.add_parser('persist-grobid',
+        help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres")
+    sub_persist_grobid.add_argument('--s3-only',
+        action='store_true',
+        help="only upload TEI-XML to S3 (don't write to database)")
+    sub_persist_grobid.add_argument('--db-only',
+        action='store_true',
+        help="only write status to database (don't upload TEI-XML to S3)")
+    sub_persist_grobid.set_defaults(func=run_persist_grobid)
+
+    sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
+        help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")
+    # help text below talks about extracted text; it used to say TEI-XML,
+    # copied from the persist-grobid options
+    sub_persist_pdftext.add_argument('--s3-only',
+        action='store_true',
+        help="only upload extracted text to S3 (don't write to database)")
+    sub_persist_pdftext.add_argument('--db-only',
+        action='store_true',
+        help="only write status to database (don't upload extracted text to S3)")
+    sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
+
+    # the thumbnail persist worker is configured with S3 settings only, so
+    # the help text does not mention postgres
+    sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
+        help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs)")
+    sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
+
+    sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc',
+        help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket")
+    sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
+
+    sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml',
+        help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket")
+    sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
+
+    sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
+        help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+    sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
+
+    sub_ingest_file = subparsers.add_parser('ingest-file',
+        help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
+    sub_ingest_file.add_argument('--bulk',
+        action='store_true',
+        help="consume from bulk kafka topic (eg, for ingest backfill)")
+    sub_ingest_file.add_argument('--priority',
+        action='store_true',
+        help="consume from priority kafka topic (eg, for SPN requests)")
+    sub_ingest_file.set_defaults(func=run_ingest_file)
+
+    sub_persist_ingest_file = subparsers.add_parser('persist-ingest-file',
+        help="daemon that consumes ingest-file output from Kafka and pushes to postgres")
+    sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
+
+    args = parser.parse_args()
+    # `func` is only set when a sub-command was selected
+    if not getattr(args, "func", None):
+        parser.print_help(file=sys.stderr)
+        sys.exit(-1)
+
+    args.func(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
new file mode 100755
index 0000000..03a1f29
--- /dev/null
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+"""
+This script is intended to be used for backfill ingest of old crawls. It can
+also be used as a fast path for getting freshly crawled content into fatcat if
+the crawl was a hit and the arabesque JSON was exported conservatively.
+
+Run like:
+
+ ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json
+
+Can then run through requests using that tool, or dump into kafka queue.
+"""
+
+import sys
+import json
+import argparse
+
+
+def run(args):
+    """
+    Convert arabesque rows (one JSON object per line of `args.json_file`)
+    into ingest requests, printed to stdout as one JSON object per line.
+
+    Blank lines and rows whose `hit` field is falsy are skipped.
+    """
+    for raw_line in args.json_file:
+        if not raw_line.strip():
+            continue
+        row = json.loads(raw_line)
+        if row['hit']:
+            ingest_request = {
+                'base_url': row['final_url'],
+                'ingest_type': args.ingest_type,
+                'link_source': args.link_source,
+                'link_source_id': row['identifier'],
+                'ingest_request_source': args.ingest_request_source,
+                'ext_ids': {
+                    args.extid_type: row['identifier'],
+                },
+            }
+            stage = args.release_stage
+            if stage:
+                # only these release stages are accepted
+                assert stage in ('published', 'submitted', 'accepted', 'draft', 'update')
+                ingest_request['release_stage'] = stage
+
+            print("{}".format(json.dumps(ingest_request, sort_keys=True)))
+
+def main():
+    """
+    Command-line entry point: parse the options and the arabesque JSON file
+    argument, then convert it with run().
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--link-source',
+        required=True,
+        help="link_source to include in request")
+    parser.add_argument('--extid-type',
+        required=True,
+        help="extid to encode identifier as")
+    parser.add_argument('--ingest-type',
+        default="pdf",
+        help="ingest type (pdf, html, xml, etc)")
+    parser.add_argument('--ingest-request-source',
+        default="arabesque",
+        help="to include in request")
+    parser.add_argument('--release-stage',
+        default=None,
+        help="to include in request")
+    parser.add_argument('json_file',
+        help="arabesque output file to use",
+        type=argparse.FileType('r'))
+    # no sub-commands are defined for this script, so no subparsers are
+    # registered (an empty set only adds a bogus "{}" entry to the usage)
+
+    args = parser.parse_args()
+
+    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..e867b21
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+    ./cdx_collection.py SOME_COLLECTION_NAME
+"""
+
+import os
+import sys
+import shutil
+import tempfile
+import requests
+import subprocess
+import internetarchive as ia
+
+def run():
+    """
+    Download the `<item>.cdx.gz` file of every web item in the collection
+    named by the single command-line argument, merge them, and write the
+    result to `<collection>.cdx.gz` in the current directory.
+
+    Exits with status -1 on a wrong argument count or when the collection
+    has no items. Read timeouts on individual downloads are reported on
+    stderr and skipped. The temporary download directory is always removed.
+    """
+
+    if len(sys.argv) != 2:
+        print("Expected a single argument (collection name)")
+        sys.exit(-1)
+
+    collection = sys.argv[1]
+
+    # Check collection name is clean (it ends up in an output file name)
+    assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+
+    tempdir = tempfile.mkdtemp()
+    try:
+        print("Looking up collection: {}".format(collection))
+
+        # First fetch list
+        item_list = list(
+            ia.search_items(
+                query="collection:{} mediatype:web".format(collection)))
+
+        if len(item_list) == 0:
+            print("No items found, bailing")
+            sys.exit(-1)
+
+        print("Found {} potential items".format(len(item_list)))
+        errors = []
+        for item in item_list:
+            item = item['identifier']
+            # TODO: error handling
+            try:
+                ia.download(item, files=[item + '.cdx.gz'],
+                    verbose=True,
+                    destdir=tempdir,
+                    no_directory=True,
+                    retries=1000)
+            except requests.exceptions.ReadTimeout as rt:
+                print(str(rt), file=sys.stderr)
+                errors.append(rt)
+                continue
+
+        if errors:
+            print("## Download Errors", file=sys.stderr)
+            for e in errors:
+                print(e, file=sys.stderr)
+
+        # Combine files
+        print("Merging and re-compressing all CDX files...")
+        #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+        subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
+            shell=True)
+
+        # Move the merged file out of the temporary directory
+        shutil.move('{}/combined.gz'.format(tempdir),
+            '{}.cdx.gz'.format(collection))
+    finally:
+        # cleanup: remove the temporary directory and the per-item
+        # downloads in it, also on early exit or error
+        shutil.rmtree(tempdir, ignore_errors=True)
+
+    print("Done!")
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
new file mode 100755
index 0000000..33c425d
--- /dev/null
+++ b/python/scripts/covid2ingestrequest.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+"""
+Transform a COVID-19 metadata dump (JSON lines; CNKI and Wanfang records) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+
+def canon(s):
+    """Return `s` parsed by urlcanon and canonicalized with its `whatwg` canonicalizer, as a str."""
+    url = urlcanon.parse_url(s)
+    canonical = urlcanon.whatwg(url)
+    return str(canonical)
+
+
+def transform_cnki(obj):
+    """
+    Build ingest requests for one CNKI record: always one for `info_url`,
+    plus one for `read_url` when the record has that key.
+
+    Asserts that `cnki_id` is present and truthy. Returns a list of dicts.
+    """
+    assert obj['cnki_id']
+
+    def build_request(url):
+        # both requests use ingest_type 'pdf'; the read_url target is
+        # actually HTML
+        return {
+            'base_url': canon(url),
+            'ingest_type': 'pdf',
+            'link_source': 'cnki_covid19',
+            'link_source_id': obj['cnki_id'],
+            'ingest_request_source': 'scrape-covid19',
+        }
+
+    requests = [build_request(obj['info_url'])]
+    if 'read_url' in obj:
+        requests.append(build_request(obj['read_url']))
+
+    return requests
+
+def transform_wanfang(obj):
+    """
+    Build the single ingest request for one Wanfang record.
+
+    Asserts that `wanfang_id` is present and truthy. Returns a one-element
+    list of dicts.
+    """
+    assert obj['wanfang_id']
+    request = {
+        'base_url': canon(obj['url']),
+        'ingest_type': 'pdf',
+        'link_source': 'wanfang_covid19',
+        'link_source_id': obj['wanfang_id'],
+        'ingest_request_source': 'scrape-covid19',
+    }
+    return [request]
+
+
+def run(args):
+    """
+    Read one JSON record per line from `args.json_file` and print the
+    resulting ingest requests to stdout, one JSON object per line.
+
+    Blank lines and records with neither `wanfang_id` nor `cnki_id` are
+    skipped; `wanfang_id` takes precedence when both are present.
+    """
+    for raw_line in args.json_file:
+        if not raw_line.strip():
+            continue
+        record = json.loads(raw_line)
+
+        if 'wanfang_id' in record:
+            transform = transform_wanfang
+        elif 'cnki_id' in record:
+            transform = transform_cnki
+        else:
+            continue
+        for ingest_request in (transform(record) or []):
+            print("{}".format(json.dumps(ingest_request, sort_keys=True)))
+
+def main():
+    """
+    Command-line entry point: parse the metadata file argument and convert
+    it with run().
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('json_file',
+        help="COVID-19 metadata file to use",
+        type=argparse.FileType('r'))
+    # no sub-commands are defined for this script, so no subparsers are
+    # registered (an empty set only adds a bogus "{}" entry to the usage)
+
+    args = parser.parse_args()
+
+    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
new file mode 100755
index 0000000..86b3b35
--- /dev/null
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump
+(from HBase) to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, json (including grobid0:tei_xml)`
+ => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered
+ down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
+"""
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
def b32_hex(s):
    """
    Normalize a SHA-1 style token to lowercase hex.

    Takes the first whitespace-separated token of `s`, lowercases it and
    drops a leading "sha1:". A 32-character result is decoded as base32 and
    returned as hex; anything else is returned as-is.
    """
    token = s.strip().split()[0].lower()
    if token.startswith("sha1:"):
        token = token[len("sha1:"):]
    if len(token) == 32:
        return base64.b32decode(token.upper()).hex()
    return token
+
+
class DeliverDumpGrobidS3():
    """
    Uploads GROBID TEI-XML from a `sha1 <TAB> json` dump to an S3 bucket.

    Objects are stored under `<s3_prefix><sha1[0:4]>/<sha1><s3_suffix>`;
    per-line outcomes are tallied in `self.count` and logged to stdout.
    """

    def __init__(self, s3_bucket, **kwargs):
        """Optional kwargs: `s3_prefix`, `s3_suffix`, `s3_storage_class`; other keys are ignored."""
        self.rstore = None
        self.count = Counter()
        self.s3_bucket = s3_bucket
        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(self.s3_bucket)

    def run(self, dump_file):
        """
        Process every line of `dump_file` (an iterable of text lines).

        Lines that do not split into exactly two tab-separated fields are
        counted as 'skip-line'; records with no `tei_xml` are logged and
        counted as 'skip-empty'; everything else is uploaded. The final
        counts are written to stderr.
        """
        sys.stderr.write("Starting...\n")
        for raw_line in dump_file:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                self.count['skip-line'] += 1
                continue
            sha1_hex, grobid_json = fields
            # keys that are not 40 characters long go through b32_hex()
            if len(sha1_hex) != 40:
                sha1_hex = b32_hex(sha1_hex)
            assert len(sha1_hex) == 40
            tei_xml = json.loads(grobid_json).get('tei_xml')
            if not tei_xml:
                print("{}\tskip empty".format(sha1_hex))
                self.count['skip-empty'] += 1
                continue
            body = tei_xml.encode('utf-8')
            # upload to AWS S3
            key = "{}{}/{}{}".format(
                self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix)
            obj = self.bucket.put_object(
                Key=key,
                Body=body,
                StorageClass=self.s3_storage_class,
            )
            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(body)))
            self.count['success-s3'] += 1
        sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
    """
    Command-line entry point: parse arguments, build a DeliverDumpGrobidS3
    worker from them and run it over the dump file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--s3-bucket',
                        required=True,
                        type=str,
                        help='AWS S3 bucket to upload into')
    parser.add_argument('--s3-prefix',
                        type=str,
                        default="grobid/",
                        help='key prefix for items created in bucket')
    parser.add_argument('--s3-suffix',
                        type=str,
                        default=".tei.xml",
                        help='file suffix for created objects')
    parser.add_argument('--s3-storage-class',
                        type=str,
                        default="STANDARD",
                        help='AWS S3 storage class (redundancy) to use')
    # nargs='?' is what makes the stdin default reachable; without it the
    # positional is mandatory and `default` is never used
    parser.add_argument('dump_file',
                        help="TSV/JSON dump file",
                        nargs='?',
                        default=sys.stdin,
                        type=argparse.FileType('r'))
    args = parser.parse_args()

    # every parsed option is forwarded as a keyword argument
    worker = DeliverDumpGrobidS3(**vars(args))
    worker.run(args.dump_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
new file mode 100755
index 0000000..3dcf962
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to local disk.
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+class DeliverGwbDisk:
+
+ def __init__(self, disk_dir, **kwargs):
+ self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.rstore = None
+ self.count = Counter()
+ # /serve/ instead of /download/ doesn't record view count
+ self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.disk_dir = disk_dir
+ self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
+ self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+
+ def fetch_warc_content(self, warc_path, offset, c_size):
+ warc_uri = self.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
+ try:
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ except wayback.exception.ResourceUnavailable:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ if gwb_record.get_status()[0] != 200:
+ return None, dict(status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
+
+ def run(self, manifest_file):
+ sys.stderr.write("Ensuring all 65536 base directories exist...\n")
+ for i in range(256):
+ for j in range(256):
+ fpath = "{}/{}{:02x}/{:02x}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ i,
+ j)
+ os.makedirs(fpath, exist_ok=True)
+ sys.stderr.write("Starting...\n")
+ for line in manifest_file:
+ self.count['total'] += 1
+ line = line.strip().split('\t')
+ if len(line) != 2:
+ self.count['skip-line'] += 1
+ continue
+ sha1_hex, cdx_json = line[0], line[1]
+ assert len(sha1_hex) == 40
+ file_cdx = json.loads(cdx_json)
+ # If warc is not item/file.(w)arc.gz form, skip it
+ if len(file_cdx['warc'].split('/')) != 2:
+ sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
+ self.count['skip-warc'] += 1
+ continue
+ # fetch from GWB/petabox via HTTP range-request
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ if blob is None and status:
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ self.count['err-petabox-fetch'] += 1
+ continue
+ elif not blob:
+ print("{}\tskip-empty-blob".format(sha1_hex))
+ self.count['skip-empty-blob'] += 1
+ continue
+ # verify sha1
+ if sha1_hex != hashlib.sha1(blob).hexdigest():
+ #assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ print("{}\terror petabox-hash-mismatch".format(sha1_hex))
+ self.count['err-petabox-hash-mismatch'] += 1
+
+ self.count['petabox-ok'] += 1
+ # save to disk
+ fpath = "{}/{}{}/{}/{}{}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix)
+ with open(fpath, 'wb') as f:
+ f.write(blob)
+ print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
+ self.count['success-disk'] += 1
+ sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
    """
    Command-line entry point: parse arguments, build a DeliverGwbDisk worker
    from them and run it over the manifest file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--disk-dir',
                        required=True,
                        type=str,
                        help='local base directory to save into')
    parser.add_argument('--disk-prefix',
                        type=str,
                        default="pdf/",
                        help='directory prefix for files created under the base directory')
    parser.add_argument('--disk-suffix',
                        type=str,
                        default=".pdf",
                        help='file suffix for created files')
    parser.add_argument('--warc-uri-prefix',
                        type=str,
                        default='https://archive.org/serve/',
                        help='URI where WARCs can be found')
    # nargs='?' is what makes the stdin default reachable; without it the
    # positional is mandatory and `default` is never used
    parser.add_argument('manifest_file',
                        help="TSV/JSON manifest file",
                        nargs='?',
                        default=sys.stdin,
                        type=argparse.FileType('r'))
    args = parser.parse_args()

    # every parsed option is forwarded as a keyword argument
    worker = DeliverGwbDisk(**vars(args))
    worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
new file mode 100755
index 0000000..39ac000
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, file:cdx (json)`
+ => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/.
+
+20x threads on a single machine can process about 340k files in 3 hours; that's
+roughly 6 hours per million per host with 32 threads, or 5k files an hour
+(1.6/second) per thread. Two large machines should be able to upload 10 million
+files in about 30 hours.
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
class DeliverGwbS3:
    """
    Copies files listed in a manifest from wayback/petabox WARCs to an S3 bucket.

    Objects are stored under `<s3_prefix><sha1[0:4]>/<sha1><s3_suffix>`.
    Per-line outcomes are tallied in `self.count` and logged to stdout.
    """

    def __init__(self, s3_bucket, **kwargs):
        """
        s3_bucket: name of the bucket to upload into.
        kwargs: optional `warc_uri_prefix`, `petabox_base_url`,
            `petabox_webdata_secret`, `s3_prefix`, `s3_suffix`; other keys
            are ignored.
        """
        self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
        # created lazily by the first fetch_warc_content() call
        self.rstore = None
        self.count = Counter()
        # /serve/ instead of /download/ doesn't record view count
        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
        self.s3_bucket = s3_bucket
        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
        self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(self.s3_bucket)

    def fetch_warc_content(self, warc_path, offset, c_size):
        """
        Load one record from `warc_uri_prefix + warc_path` and return its body.

        Returns `(raw_bytes, None)` on success, or `(None, status)` where
        `status` is a dict with `status="error"` and a `reason` string when
        the load fails with one of the handled exceptions, the archived HTTP
        status is not 200, or the body read is incomplete. Other exceptions
        propagate.
        """
        warc_uri = self.warc_uri_prefix + warc_path
        if not self.rstore:
            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                webdata_secret=self.petabox_webdata_secret,
                download_base_url=self.petabox_base_url))
        try:
            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
        except wayback.exception.ResourceUnavailable:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
        except ValueError as ve:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
        except EOFError as eofe:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
        except TypeError as te:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
        # Note: could consider a generic "except Exception" here, as we get so
        # many petabox errors. Do want jobs to fail loud and clear when the
        # whole cluster is down though.

        if gwb_record.get_status()[0] != 200:
            return None, dict(status="error",
                reason="archived HTTP response (WARC) was not 200",
                warc_status=gwb_record.get_status()[0])

        try:
            raw_content = gwb_record.open_raw_content().read()
        except IncompleteRead as ire:
            return None, dict(status="error",
                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
        return raw_content, None

    def run(self, manifest_file):
        """
        Process every `sha1_hex <TAB> cdx_json` line of `manifest_file`.

        A blob is uploaded only when it was fetched, is non-empty and its
        SHA-1 equals the manifest key; every other outcome is logged, counted
        and skipped. The final counts are written to stderr.
        """
        sys.stderr.write("Starting...\n")
        for line in manifest_file:
            self.count['total'] += 1
            line = line.strip().split('\t')
            if len(line) != 2:
                self.count['skip-line'] += 1
                continue
            sha1_hex, cdx_json = line[0], line[1]
            assert len(sha1_hex) == 40
            file_cdx = json.loads(cdx_json)
            # If warc is not item/file.(w)arc.gz form, skip it
            if len(file_cdx['warc'].split('/')) != 2:
                sys.stderr.write('WARC path not petabox item/file: {}\n'.format(file_cdx['warc']))
                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
                self.count['skip-warc'] += 1
                continue
            # fetch from GWB/petabox via HTTP range-request
            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
            if blob is None and status:
                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
                self.count['err-petabox-fetch'] += 1
                continue
            elif not blob:
                print("{}\tskip-empty-blob".format(sha1_hex))
                self.count['skip-empty-blob'] += 1
                continue
            # verify sha1; a mismatching blob must not be uploaded under (or
            # reported as a success for) a hash it does not have
            if sha1_hex != hashlib.sha1(blob).hexdigest():
                print("{}\terror petabox-hash-mismatch".format(sha1_hex))
                self.count['err-petabox-hash-mismatch'] += 1
                continue

            self.count['petabox-ok'] += 1
            # upload to AWS S3
            obj = self.bucket.put_object(
                Key="{}{}/{}{}".format(
                    self.s3_prefix,
                    sha1_hex[0:4],
                    sha1_hex,
                    self.s3_suffix),
                Body=blob)
            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
            self.count['success-s3'] += 1
        sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
    """
    Command-line entry point: parse arguments, build a DeliverGwbS3 worker
    from them and run it over the manifest file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--s3-bucket',
                        required=True,
                        type=str,
                        help='AWS S3 bucket to upload into')
    parser.add_argument('--s3-prefix',
                        type=str,
                        default="pdf/",
                        help='key prefix for items created in bucket')
    parser.add_argument('--s3-suffix',
                        type=str,
                        default=".pdf",
                        help='file suffix for created objects')
    parser.add_argument('--warc-uri-prefix',
                        type=str,
                        default='https://archive.org/serve/',
                        help='URI where WARCs can be found')
    # nargs='?' is what makes the stdin default reachable; without it the
    # positional is mandatory and `default` is never used
    parser.add_argument('manifest_file',
                        help="TSV/JSON manifest file",
                        nargs='?',
                        default=sys.stdin,
                        type=argparse.FileType('r'))
    args = parser.parse_args()

    # every parsed option is forwarded as a keyword argument
    worker = DeliverGwbS3(**vars(args))
    worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..a7214d0
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+"""
+Transform a DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They seem to often be
+landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
+in the HTML headers and adds an ingest request on that basis. Or even just run
+the re-ingest in-process and publish a second result.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+from typing import Optional, List
+
# URL substrings: transform() skips any link whose lowercased URL contains one
# of these.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)

    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    #"semanticscholar.org/",
    "://doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",

    # large publishers/platforms; may remove in the future
    #"://link.springer.com/",
    #"://dergipark.gov.tr/",
    #"frontiersin.org/",
    #"scielo",
]

# Maps a link's `content_type` to the ingest types to request for it; an empty
# list (or a missing key) means the link is skipped.
# These default to PDF; note that we also do pdf ingests for HTML pages.
# Keys must be lowercase: transform() lowercases the content type before the
# lookup, so a mixed-case key can never match.
CONTENT_TYPE_MAP = {
    "abstract": [],
    "doc": [],
    "": ["pdf"],

    "doi": ["pdf"],
    "url": ["pdf"],
    "fulltext": ["pdf"],
    "anysimpletype": ["pdf"],

    "application/pdf": ["pdf"],
    "html": ["html", "pdf"],
    "text/html": ["html", "pdf"],
    "xml": ["xml"],
}
+
def canon(s: str) -> str:
    """Parse URL string `s` with urlcanon, apply its `whatwg` canonicalizer, and return the result as a str."""
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
+
def transform(obj: dict) -> List[dict]:
    """
    Transforms from a single DOAJ object to zero or more ingest requests.
    Returns a list of dicts.

    Only links of type "fulltext" with a URL are considered. A link is
    skipped when its content type maps to no ingest types, its URL contains a
    DOMAIN_BLOCKLIST entry, the URL cannot be canonicalized
    (UnicodeEncodeError), or the canonical URL is empty or longer than 1000
    characters. One request is emitted per (link, ingest type) pair.

    Records with a missing or empty `link` list yield no requests; a missing
    `identifier` list just means no DOI is attached.

    Raises KeyError if `id` or `bibjson` is missing and AssertionError if
    `id` is empty.
    """

    doaj_id = obj['id'].lower()
    assert doaj_id

    bibjson = obj['bibjson']
    links = bibjson.get('link') or []
    if not links:
        return []

    # if several DOI identifiers are present, the last well-formed one wins
    doi: Optional[str] = None
    for ident in (bibjson.get('identifier') or []):
        ident_type = (ident.get('type') or '').lower()
        ident_id = ident.get('id') or ''
        if ident_type == "doi" and ident_id.startswith('10.'):
            doi = ident_id.lower()

    requests = []
    for link in links:
        if link.get('type') != "fulltext" or not link.get('url'):
            continue
        ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
        if not ingest_types:
            continue

        url_lower = link['url'].lower()
        if any(domain in url_lower for domain in DOMAIN_BLOCKLIST):
            continue

        try:
            base_url = canon(link['url'].strip())
        except UnicodeEncodeError:
            continue

        if not base_url or len(base_url) > 1000:
            continue

        for ingest_type in ingest_types:
            requests.append({
                'base_url': base_url,
                'ingest_type': ingest_type,
                'link_source': 'doaj',
                'link_source_id': doaj_id,
                'ingest_request_source': 'doaj',
                'release_stage': 'published',
                'rel': 'publisher',
                'ext_ids': {
                    'doi': doi,
                    'doaj': doaj_id,
                },
                'edit_extra': {},
            })

    return requests
+
def run(args) -> None:
    """
    Read JSON lines from `args.json_file`, pass each record to transform(),
    and print every resulting request as JSON (sorted keys), one per line.
    Blank lines are skipped.
    """
    for raw_line in args.json_file:
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        for request in (transform(record) or []):
            print("{}".format(json.dumps(request, sort_keys=True)))
+
def main() -> None:
    """Parse the command line (one positional dump file) and hand the parsed args to run()."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="DOAJ article dump file to use",
        type=argparse.FileType('r'))

    args = parser.parse_args()

    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/scripts/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes a "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+ - url
+ - dt
+ (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+import base64
+
def run():
    """
    Convert each 5-column TSV line on stdin into one JSON object on stdout.

    Columns are: base32 SHA-1 (any "sha1:" text removed), DOI list (JSON),
    CDX object (JSON), mimetype, size. The SHA-1 is re-encoded as lowercase
    hex and only `url` and `dt` are kept from the CDX object.
    """
    for raw_line in sys.stdin:
        columns = raw_line.split('\t')
        assert len(columns) == 5
        b32_sha1 = columns[0].replace('sha1:', '')
        dois = json.loads(columns[1])
        cdx = json.loads(columns[2])
        mimetype = columns[3]
        size = int(columns[4])

        digest = base64.b32decode(b32_sha1)
        sha1 = base64.b16encode(digest).decode('ascii').lower()

        record = {
            'sha1': sha1,
            'dois': dois,
            'cdx': [{'url': cdx['url'], 'dt': cdx['dt']}],
            'size': size,
            'mimetype': mimetype,
        }
        print(json.dumps(record))
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
new file mode 100755
index 0000000..dc4bea7
--- /dev/null
+++ b/python/scripts/filter_grobid_metadata.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+
+with open('title_slug_denylist.txt', 'r') as f:  # read at import time; the path is relative to the current working directory
+ TITLE_DENYLIST = [l.strip() for l in f]  # each stripped line becomes one entry
+
+TITLE_DENYLIST.extend((  # hard-coded slugs appended to the entries loaded from the file
+ 'editorial',
+ 'advertisement',
+ 'bookreviews',
+ 'reviews',
+ 'nr',
+ 'abstractoriginalarticle',
+ 'originalarticle',
+ 'impactfactor',
+ 'articlenumber',
+))
+
+# The full name can't *entirely* be one of these
+NAME_DENYLIST = (  # filter_author_name() compares the stripped, lowercased name with spaces removed
+ 'phd',
+ 'phdstudent',
+)
+
def tokenize(s, remove_whitespace=True):
    """
    Reduce `s` to a lowercase ASCII-letters slug.

    The XML entity "&apos;" is turned back into an apostrophe first (so its
    letters do not leak into the slug), then everything except letters and
    whitespace is dropped. Whitespace is removed too unless
    `remove_whitespace` is false, and any remaining non-ASCII characters are
    discarded.
    """
    # str.replace returns a new string; the result has to be assigned
    s = s.replace('&apos;', "'")
    # Keep only letters and whitespace (digits and punctuation are dropped)
    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])

    if remove_whitespace:
        s = ''.join(s.split())

    # Drop whatever is not plain ASCII
    return s.encode('ascii', 'ignore').decode('ascii')
+
+assert tokenize("Impact Factor: 2.114") == "impactfactor"
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
    """
    Return a cleaned-up title, or None when the title looks like junk.

    Rejected: titles over 500 characters, slugs shorter than 10 characters or
    present in TITLE_DENYLIST, slugs starting with "nr" or "issn", "int j "
    and "The Journal of " style journal names, volume/issue or
    "downloaded from" boilerplate, titles with fewer than 2 or more than 50
    words, letter-spaced text, and titles with too many ':', '|' or '.'
    separators.

    A leading "Title: " / "Original Article" style label is stripped from the
    returned title.
    """

    title = title.strip()
    if len(title) > 500:
        return None
    title_slug = tokenize(title, remove_whitespace=True)
    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
        return None
    if title_slug.startswith('nr'):
        return None
    if title.lower().replace('.', '').startswith('int j '):
        return None

    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
        if title.startswith(prefix):
            # strip only the leading label, and keep the result
            title = title[len(prefix):]

    if title.startswith("The Journal of "):
        return None

    if "volume" in title_slug and "issue" in title_slug:
        return None

    if "downloadedfrom" in title_slug:
        return None

    if title_slug.startswith("issn"):
        return None

    # titles with too many or too few words in title
    title_words = len(title.split())
    if title_words > 50 or title_words < 2:
        return None

    # titles with spaces between every letter (more than N such single-char words)
    if len([True for w in title.split() if len(w) == 1]) > 12:
        return None

    # too deep subtitling/splitting
    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
        return None

    return title
+
def filter_author_name(name):
    """
    Clean one author entry (a dict with a 'name' key).

    Returns None when the whole name, lowercased with spaces removed, is in
    NAME_DENYLIST. Otherwise returns the name with every whitespace-separated
    part that tokenizes to an empty string dropped.
    """
    full_name = name['name']
    squashed = full_name.strip().lower().replace(' ', '')
    if squashed in NAME_DENYLIST:
        return None
    kept_parts = [part for part in full_name.split() if tokenize(part)]
    return ' '.join(kept_parts)
+
def filter_authors(l):
    """Run filter_author_name() over `l`; keep results longer than one character, each wrapped as `{'name': ...}`."""
    cleaned = []
    for author in l:
        name = filter_author_name(author)
        if name and len(name) > 1:
            cleaned.append(dict(name=name))
    return cleaned
+
def filter_refs(l):
    """Placeholder: returns the reference list `l` unchanged (no filtering is implemented yet)."""
    # TODO:
    return l
+
def filter_journal_name(name):
    """
    Return a cleaned-up journal name, or None when it looks like junk.

    Rejected: empty names, slugs in TITLE_DENYLIST (same denylist as titles,
    for now) or shorter than 4 characters, the literal "N.º", names
    containing a long "=" rule, and names over 150 characters.

    Known boilerplate is stripped from the start and end of the name only;
    the same text appearing in the middle is left alone. Whitespace runs in
    the result are collapsed to single spaces.
    """
    if not name:
        return None
    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
    slug_name = tokenize(name)
    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
        return None
    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
        if name.startswith(prefix):
            name = name[len(prefix):]
    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    if "====================" in name:
        return None
    if len(name) > 150:
        return None
    return ' '.join(name.split())
+
def filter_metadata(obj):
    """
    Clean one metadata dict in place and return it, or return None when it
    has no title, no authors, or a title rejected by filter_title().

    On success `title`, `authors`, `citations` and `journal.name` are
    replaced by their filtered versions.
    """
    if not obj.get('title') or not obj.get('authors'):
        return None

    title = filter_title(obj['title'])
    if not title:
        return None

    obj['title'] = title
    obj['authors'] = filter_authors(obj['authors'])
    obj['citations'] = filter_refs(obj['citations'])
    obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
    return obj
+
def run(invert=False):
    """
    Filter metadata lines from stdin to stdout.

    Each line is either a bare JSON object or a 5-column TSV row whose last
    column is the JSON; any other column count is reported on stderr as
    "bad line" and skipped. In normal mode, records accepted by
    filter_metadata() are printed with the JSON replaced by its filtered
    form. With `invert`, only the raw JSON of rejected records is printed.
    """
    for line in sys.stdin:
        fields = line.split('\t')
        n_fields = len(fields)
        if n_fields not in (1, 5):
            sys.stderr.write("bad line\n")
            continue
        # the JSON is the last column of a 5-column row, or the whole line
        json_idx = 4 if n_fields == 5 else 0
        raw = fields[json_idx]
        processed = filter_metadata(json.loads(raw))
        if not processed:
            if invert:
                print(raw.strip())
            continue
        if invert:
            continue
        fields[json_idx] = json.dumps(processed)
        print('\t'.join(fields))
+
+if __name__=="__main__":
+ run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
new file mode 100755
index 0000000..bbba770
--- /dev/null
+++ b/python/scripts/filter_groupworks.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Filters an input stream of sorted "groupworks" scalding job, and outputs
+"good enough" matches to be merged in fatcat.
+
+Output is JSON lines which are arrays of releases that could/should be merged
+together, either as multiple releases under a single work, or releases merged
+into a single entity (via redirects).
+
+Note that releases *should* only end up on a single line, and only once per
+line!
+
+No dependencies (only python3 stdlib)
+
+Note: the actual importer/merger should filter the following patterns out:
+- container title has "letter" and "diar"
+- contribs (authors) contain "&NA;"
+- dates differ (not just year)
+"""
+
+import sys
+import json
+
+# out of 1000
+SCORE_THRESHOLD = 900  # process_group() drops rows scoring below this
+
+MAX_SLUG_LINES = 50  # run() skips slug groups with more lines than this
+
+REQUIRE_AUTHORS = False  # when False, check_authors() accepts a pair whose author lists are both empty
+
def tokenize(s, remove_whitespace=False):
    """
    Reduce `s` to lowercase ASCII alphanumerics, returned as bytes.

    The XML entity "&apos;" is turned back into an apostrophe first (so its
    letters do not leak into the result), then everything except letters,
    digits and whitespace is dropped. Whitespace is kept unless
    `remove_whitespace` is true; non-ASCII characters are discarded.
    """
    # str.replace returns a new string; the result has to be assigned
    s = s.replace('&apos;', "'")
    # Remove non-alphanumeric characters
    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])

    if remove_whitespace:
        s = ''.join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible)
    return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
    """
    Intended to check GROBID extracted authors (right) against "known good"
    (but maybe not perfect) Crossref metadata authors ("left").

    Two empty lists match unless REQUIRE_AUTHORS is set. Otherwise the lists
    must have the same length, and the tokenized last word of every left
    name (more than one character long) must occur somewhere in the
    tokenized, space-joined right names.
    """
    if not left and not right:
        return bool(not REQUIRE_AUTHORS)
    if len(left) != len(right):
        return False
    haystack = tokenize(" ".join(right))
    for author in left:
        parts = author.lower().replace('jr.', '').split()
        if not parts:
            return False
        surname = tokenize(parts[-1])
        # a single-character (or empty) surname is too weird to match on
        if len(surname) <= 1 or surname not in haystack:
            return False
    return True
+
def test_check_authors():
    """Spot checks for check_authors(): empty lists, a length mismatch, and pairs that should match."""
    assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
    assert not check_authors([], ['one'])
    matching_pairs = [
        (['one'], ['one']),
        (['one two'], ['One Two']),
        (['two'], ['One Two']),
        (['two'], ['two, one']),
        (['mago'], ['Mr. Magoo']),
        (['Mr. Magoo'], ['Mr Magoo']),
        (['one', 'tw', 'thr'], ['one', 'two', 'three']),
    ]
    for left, right in matching_pairs:
        assert check_authors(left, right)
+
# Rows are (score, left, right)
def process_group(rows):
    """
    Print one JSON array of releases that should be merged, for one slug group.

    `rows` are `(score, left_json, right_json)` triples. Pairs are kept when
    the score reaches SCORE_THRESHOLD, check_authors() accepts the author
    lists and the years agree (when both are set). The first kept pair seeds
    the group; later pairs join only if they share a release id with it.
    Nothing is printed when no pair qualifies.

    Output order is first-seen order of the release ids.
    """

    # first pass reduces size of list and generates a linkage graph
    filtered = list()
    for row in rows:
        score = int(row[0])
        if score < SCORE_THRESHOLD:
            continue
        left = json.loads(row[1])
        right = json.loads(row[2])
        # authors must roughly match
        if not check_authors(left['authors'], right['authors']):
            continue
        # years must match (if defined)
        if left['year'] and right['year'] and left['year'] != right['year']:
            continue
        filtered.append((left, right))

    if not filtered:
        return

    # second pass finds a connected graph and returns that; every kept pair
    # takes part, including the first one
    releases = dict()
    group_ids = dict()  # used as an insertion-ordered set
    for (left, right) in filtered:
        l_id = left['fatcat_release']
        r_id = right['fatcat_release']
        releases[l_id] = left
        releases[r_id] = right
        if not group_ids or l_id in group_ids or r_id in group_ids:
            group_ids[l_id] = True
            group_ids[r_id] = True

    print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
    """
    Read sorted `slug <TAB> score <TAB> left <TAB> right` lines from stdin and
    pass each run of lines sharing a slug to process_group().

    Groups with more than MAX_SLUG_LINES lines are skipped; this applies to
    every group, including the last one in the input.
    """

    def flush(group):
        # oversized groups are dropped rather than processed
        if group and len(group) <= MAX_SLUG_LINES:
            process_group(group)

    last_slug = None
    lines = []

    # group lines by slug, and process in batches
    for line in sys.stdin:
        line = line.strip().split('\t')
        assert len(line) == 4
        slug = line[0]
        if last_slug and slug != last_slug and lines:
            flush(lines)
            lines = []
        last_slug = slug
        lines.append(line[1:])

    # catch any remaining
    flush(lines)
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
new file mode 100755
index 0000000..3654b87
--- /dev/null
+++ b/python/scripts/filter_scored_matches.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Filters an input stream of sorted "matchcrossref" scalding job, and outputs
+"good enough" matches to be inserted to fatcat.
+
+Currently works on DOI numbers. Filters for a high enough string match (doesn't
+re-do title match), and checks author lists. Filters out slugs with too many
+matches, and outputs one-line-per-sha1 (aka, file).
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+
+# out of 1000
+score_threshold = 900  # process_group() drops rows scoring below this
+
+max_slug_lines = 10  # process_group() ignores slug groups with more rows than this
+
+require_authors = 1  # NOTE(review): not referenced by the code visible in this script; check_authors() always rejects an empty left list
+
+
def tokenize(s, remove_whitespace=False):
    """
    Reduce `s` to lowercase ASCII alphanumerics, returned as bytes.

    The XML entity "&apos;" is turned back into an apostrophe first (so its
    letters do not leak into the result), then everything except letters,
    digits and whitespace is dropped. Whitespace is kept unless
    `remove_whitespace` is true; non-ASCII characters are discarded.
    """
    # str.replace returns a new string; the result has to be assigned
    s = s.replace('&apos;', "'")
    # Remove non-alphanumeric characters
    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])

    if remove_whitespace:
        s = ''.join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible)
    return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
    """
    Intended to check GROBID extracted authors (right) against "known good"
    (but maybe not perfect) Crossref metadata authors ("left").

    An empty `left` never matches, and `left` may not be longer than `right`.
    The tokenized last word of every left name (more than one character
    long) must occur somewhere in the tokenized, space-joined right names.
    """
    if not left:
        return False
    if len(left) > len(right):
        return False
    haystack = tokenize(" ".join(right))
    for author in left:
        parts = author.lower().replace('jr.', '').split()
        if not parts:
            return False
        surname = tokenize(parts[-1])
        # a single-character (or empty) surname is too weird to match on
        if len(surname) <= 1 or surname not in haystack:
            return False
    return True
+
def test_check_authors():
    """Spot checks for check_authors(): an empty left list never matches; the listed pairs should."""
    assert not check_authors([], [])
    assert not check_authors([], ['one'])
    matching_pairs = [
        (['one'], ['one']),
        (['one two'], ['One Two']),
        (['two'], ['One Two']),
        (['two'], ['two, one']),
        (['mago'], ['Mr. Magoo']),
        (['Mr. Magoo'], ['Mr Magoo']),
        (['one', 'tw', 'thr'], ['one', 'two', 'three']),
    ]
    for left, right in matching_pairs:
        assert check_authors(left, right)
+
# Rows are (score, grobid, crossref)
def process_group(rows):
    """
    Print `sha1 <TAB> JSON list of DOIs` for every file in one slug group.

    Groups with more than `max_slug_lines` rows are ignored entirely. Within
    a group, a row contributes its lowercased Crossref DOI to the GROBID
    file's sha1 when its score reaches `score_threshold` and check_authors()
    accepts the (crossref, grobid) author lists.
    """
    if len(rows) > max_slug_lines:
        return
    dois_by_sha1 = dict()
    for row in rows:
        if int(row[0]) < score_threshold:
            continue
        grobid = json.loads(row[1])
        crossref = json.loads(row[2])
        if not check_authors(crossref['authors'], grobid['authors']):
            continue
        dois_by_sha1.setdefault(grobid['sha1'], list()).append(crossref['doi'].lower())
    for sha1, doi_list in dois_by_sha1.items():
        print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
    """
    Read sorted `slug <TAB> score <TAB> grobid <TAB> crossref` lines from
    stdin and pass each run of lines sharing a slug (minus the slug column)
    to process_group().
    """
    current_slug = None
    batch = []

    # group lines by slug, and process in batches
    for raw_line in sys.stdin:
        columns = raw_line.strip().split('\t')
        assert len(columns) == 4
        slug, payload = columns[0], columns[1:]
        if current_slug and batch and slug != current_slug:
            process_group(batch)
            batch = []
        current_slug = slug
        batch.append(payload)

    # catch any remaining
    if batch:
        process_group(batch)
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
new file mode 100755
index 0000000..79feac1
--- /dev/null
+++ b/python/scripts/grobid_affiliations.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+"""
+Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
+output, converts the XML to JSON, filters out raw affiliation strings, and
+dumps these as JSON subset.
+
+Run in bulk like:
+
+ ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
+"""
+
+import sys
+import json
+
+from grobid2json import teixml2json
+
def parse_hbase(line):
    """
    Parse one old-style (HBase dump) line: "<sha1hex>\t<JSON object>".

    Returns a (sha1hex, tei_xml) tuple, where tei_xml is the 'tei_xml' field
    of the JSON object.
    """
    fields = line.split('\t')
    assert len(fields) == 2
    wrapper = json.loads(fields[1])
    return fields[0], wrapper['tei_xml']
+
def parse_pg(line):
    """
    Parse one new-style (postgres dump) line: a single JSON object.

    Returns a (sha1hex, tei_xml) tuple taken from the object's keys of the
    same names.
    """
    record = json.loads(line)
    sha1hex = record['sha1hex']
    tei_xml = record['tei_xml']
    return sha1hex, tei_xml
+
def run(mode='hbase'):
    """
    Read GROBID wrapper lines from stdin and print unique author affiliations.

    For each input line the TEI-XML is converted with teixml2json() and the
    non-empty 'affiliation' values of its authors are collected. Lines with at
    least one affiliation produce "<sha1hex>\t<JSON list of affiliations>".

    mode: 'hbase' (parse_hbase line format) or 'pg' (parse_pg line format).
    Raises NotImplementedError for any other mode, before reading any input.
    """
    if mode == 'hbase':
        parse_line = parse_hbase
    elif mode == 'pg':
        parse_line = parse_pg
    else:
        raise NotImplementedError('parse mode: {}'.format(mode))

    for line in sys.stdin:
        sha1hex, tei_xml = parse_line(line)
        obj = teixml2json(tei_xml, encumbered=False)

        # De-duplicate while keeping first-seen order, so output is stable
        # between runs. The key is the canonical (sorted-keys) JSON form, so
        # two equal affiliations with different key order count as one.
        unique = {}
        for author in obj['authors']:
            affiliation = author.get('affiliation')
            if affiliation:
                unique.setdefault(json.dumps(affiliation, sort_keys=True), affiliation)

        if unique:
            print('\t'.join([sha1hex, json.dumps(list(unique.values()))]))
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
new file mode 100755
index 0000000..d01b526
--- /dev/null
+++ b/python/scripts/import_grobid_metadata.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import datetime
+
MAX_ABSTRACT_BYTES=4096

def _leading_year(value):
    """Return the first four characters of `value` parsed as an int, or None."""
    try:
        return int(str(value).strip()[:4])
    except ValueError:
        return None

def parse_grobid_json(obj):
    """
    Convert one GROBID metadata JSON object into a release-style dict.

    Returns None when `obj` has no title. Otherwise returns a dict with keys
    title, contribs, refs, publisher, volume, issue, abstracts, release_type,
    release_date and extra:

    - abstracts: single-element list, or None if the abstract is missing or
      not shorter than MAX_ABSTRACT_BYTES
    - refs: one dict per entry of obj['citations'] (key, optional title/year,
      and an 'extra' that is either {'grobid': {...}} or None)
    - release_date: datetime for January 1st of the year in obj['date'], or
      None when there is no usable year
    - extra: {'grobid': {...}} holding doi, container_name and is_longtail_oa
    """
    if not obj.get('title'):
        return None

    # NOTE: despite the constant's name, len() counts characters, not bytes.
    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [dict(
            mimetype="text/plain",
            language=None,
            content=abstract.strip())]
    else:
        abstracts = None

    # NOTE(review): citation authors are read as dicts with a 'name' key; if
    # top-level authors have the same shape, raw_name should probably be
    # a['name'] -- confirm against the producer of this JSON.
    contribs = [dict(raw_name=a, role="author") for a in (obj.get('authors') or [])]

    refs = []
    for raw in (obj.get('citations') or []):
        ref = dict(key=raw.get('id'))
        if raw.get('title'):
            ref['title'] = raw['title'].strip()
        if raw.get('date'):
            year = _leading_year(raw['date'])
            if year is not None:
                ref['year'] = year
        # per-citation extras live in their own dict so they can never leak
        # into (or replace) the release-level `extra` built below
        ref_extra = dict()
        for key in ('volume', 'url', 'issue', 'publisher'):
            if raw.get(key):
                ref_extra[key] = raw[key].strip()
        if raw.get('authors'):
            ref_extra['authors'] = [a['name'] for a in raw['authors']]
        ref['extra'] = dict(grobid=ref_extra) if ref_extra else None
        refs.append(ref)

    release_type = "journal-article"
    release_date = None
    if obj.get('date'):
        # TODO: only the year is used, even when a full date is available
        year = _leading_year(obj['date'])
        if year is not None:
            try:
                release_date = datetime.datetime(year=year, month=1, day=1)
            except ValueError:
                # year outside the range datetime supports
                release_date = None

    # tolerate a missing or null 'journal' entry
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi'].lower()
    if journal.get('name'):
        extra['container_name'] = journal['name']
    extra['is_longtail_oa'] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    return dict(
        title=obj['title'].strip(),
        contribs=contribs,
        refs=refs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        release_type=release_type,
        release_date=release_date,
        extra=dict(grobid=extra))
+
def run():
    """
    Read one JSON object per line from stdin, convert each with
    parse_grobid_json(), and print every truthy result.
    """
    for raw_line in sys.stdin:
        record = parse_grobid_json(json.loads(raw_line))
        if not record:
            # nothing usable in this object
            continue
        print(record)
+
+if __name__=="__main__":
+ run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
new file mode 100755
index 0000000..494ec7a
--- /dev/null
+++ b/python/scripts/ingestrequest_row2json.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+"""
+This script is used to turn ingest request postgres rows (in JSON export
+format) back in to regular ingest request JSON.
+
+The only difference is the name and location of some optional keys.
+"""
+
+import sys
+import json
+import argparse
+
+
def transform(row):
    """
    Turn an exported ingest-request row (dict) back into a regular ingest
    request (dict). The row is modified in place and also returned.

    Drops 'created', removes the nested 'request' object, and lifts its
    'ext_ids' / 'edit_extra' keys to the top level; a 'release_ident' in it
    becomes row['fatcat']['release_ident'].
    """
    row.pop('created', None)
    nested = row.pop('request', None) or {}
    row.update({key: nested[key] for key in ('ext_ids', 'edit_extra') if key in nested})
    if 'release_ident' in nested:
        row['fatcat'] = {'release_ident': nested['release_ident']}
    return row
+
def run(args):
    """
    Convert every non-blank line of args.json_file with transform() and print
    the result as JSON with sorted keys.

    Lines that cannot be parsed or transformed are echoed to stderr and
    skipped, so one bad row does not abort the whole run.
    """
    for l in args.json_file:
        if not l.strip():
            continue
        try:
            req = transform(json.loads(l))
        except Exception:
            print(l, file=sys.stderr)
            # skip the bad row; without this the code below would use an
            # unbound `req` or re-emit the previous request
            continue
        print(json.dumps(req, sort_keys=True))
+
def main():
    """Parse the command line (a single input file path) and call run()."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="ingest request rows, one JSON object per line",
        type=argparse.FileType('r'))
    # no sub-commands: an empty add_subparsers() only added a stray "{}"
    # positional to the usage line

    args = parser.parse_args()

    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
new file mode 100755
index 0000000..35cee5b
--- /dev/null
+++ b/python/scripts/manifest_converter.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
+"match" JSON objects which can be imported into fatcat with matched_import.py
+
+This was used to convert this manifest:
+
+ https://archive.org/details/ia_papers_manifest_2018-01-25/
+
+to JSON format for fast fatcat importing.
+"""
+
+import sys
+import json
+import sqlite3
+
+# iterate over rows in files metadata...
+# 1. select all identified DOIs
+# => filter based on count
+# 2. select all file metadata
+# 3. output object
+
def or_none(s):
    """
    Map "no value" placeholders to None.

    Returns None for None itself and for the strings "", "\\N" and "-"; any
    other value is returned unchanged.
    """
    null_markers = ("", "\\N", "-")
    if s is None or (type(s) == str and s in null_markers):
        return None
    return s
+
def process_db(db_path):
    """
    Print one "match" JSON object per file in the sqlite3 manifest at db_path.

    For every row of files_metadata, looks up the DOIs (files_id_doi) and the
    URLs (urls) recorded for the same sha1. Files with no DOI or no URL are
    skipped; the rest are printed as a JSON object with keys sha1, mimetype,
    size, md5, dois and cdx (a list of {url, dt} objects).
    """
    db = sqlite3.connect(db_path)
    try:
        file_rows = db.execute(
            "SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata")
        for sha1, mimetype, size_bytes, md5 in file_rows:
            dois = [d[0] for d in db.execute(
                "SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])]
            if not dois:
                continue
            urls = db.execute(
                "SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
            if not urls:
                continue
            # placeholder sizes become None; real sizes are coerced to int
            size = or_none(size_bytes)
            obj = dict(
                sha1=sha1,
                mimetype=or_none(mimetype),
                size=(size and int(size)),
                md5=or_none(md5),
                dois=dois,
                cdx=[dict(url=u[0], dt=u[1]) for u in urls],
            )
            print(json.dumps(obj))
    finally:
        # release the database handle even if a query or print fails
        db.close()
+
+if __name__=="__main__":
+ process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..916f41c
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ "://127.0.0.1/",
+
+ # OAI specific additions
+ "://hdl.handle.net/",
+]
+
+RELEASE_STAGE_MAP = {
+ 'info:eu-repo/semantics/draftVersion': 'draft',
+ 'info:eu-repo/semantics/submittedVersion': 'submitted',
+ 'info:eu-repo/semantics/acceptedVersion': 'accepted',
+ 'info:eu-repo/semantics/publishedVersion': 'published',
+ 'info:eu-repo/semantics/updatedVersion': 'updated',
+}
+
def canon(s):
    """Return URL string `s` after urlcanon's WHATWG canonicalization."""
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
+
def transform(obj):
    """
    Build ingest requests from one OAI-PMH record.

    Returns a list of request dicts, one per URL of the record that is not
    blocklisted and can be canonicalized. The list is empty when the record
    has no 'oai:' identifier, has no URLs, or declares formats none of which
    mention PDF.
    """
    oai_id = obj.get('oai')
    if not oai_id or not oai_id.startswith('oai:'):
        return []
    if not obj.get('urls'):
        return []

    # A formats list without any PDF entry disqualifies the record; a missing
    # or empty formats list does not.
    declared_formats = obj.get('formats')
    if declared_formats:
        pdf_formats = [f for f in declared_formats if 'pdf' in f.lower()]
        if not pdf_formats:
            return []

    # only the first DOI is considered, and only if it looks like a DOI
    doi = None
    if obj.get('doi'):
        candidate = obj['doi'][0].lower().strip()
        doi = candidate if candidate.startswith('10.') else None

    # infer release stage from obj['types']; the last recognized type wins
    known_stages = [RELEASE_STAGE_MAP[t] for t in obj.get('types', []) if t in RELEASE_STAGE_MAP]
    release_stage = known_stages[-1] if known_stages else None

    # TODO: infer rel somehow? Eg, repository vs. OJS publisher
    rel = None

    oai_lower = oai_id.lower()
    requests = []
    for url in obj['urls']:
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            continue

        requests.append({
            'base_url': base_url,
            'ingest_type': 'pdf',
            'link_source': 'oai',
            'link_source_id': oai_lower,
            'ingest_request_source': 'metha-bulk',
            'release_stage': release_stage,
            'rel': rel,
            'ext_ids': {
                'doi': doi,
                'oai': oai_lower,
            },
            'edit_extra': {},
        })

    return requests
+
def run(args):
    """
    For every non-blank line of args.json_file, parse it as JSON, pass it to
    transform(), and print each resulting request as JSON with sorted keys.
    """
    for raw_line in args.json_file:
        if raw_line.strip():
            record = json.loads(raw_line)
            # transform() may return nothing for records that are skipped
            for request in (transform(record) or []):
                print(json.dumps(request, sort_keys=True))
+
def main():
    """Parse the command line (a single input file path) and call run()."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="OAI-PMH dump file to use (usually stdin)",
        type=argparse.FileType('r'))
    # no sub-commands: an empty add_subparsers() only added a stray "{}"
    # positional to the usage line

    args = parser.parse_args()

    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
new file mode 100755
index 0000000..af08db6
--- /dev/null
+++ b/python/scripts/pdf_thumbnail.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
+
+Originally used to benchmark and compare file size/quality.
+"""
+
+import sys
+import poppler
+from PIL import Image
+
+
def run(inpath, outpath):
    """
    Render the first page of the PDF at `inpath` and save a thumbnail to
    `outpath`; the thumbnail is bounded by 180x300 pixels.

    If the PDF cannot be loaded, or its first page cannot be created, the
    error is printed to stderr and the process exits.
    """
    try:
        pdf = poppler.load_from_file(inpath)
        page = pdf.create_page(0)
    except Exception as e:
        print(str(e), file=sys.stderr)
        # NOTE(review): exits with status 0 even though no thumbnail was
        # written; kept as-is in case batch callers rely on it -- confirm.
        sys.exit(0)

    renderer = poppler.PageRenderer()
    full_page = renderer.render_page(page)
    # the rendered buffer is read as raw BGRA data into an RGBA image
    img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
    img.thumbnail((180,300), Image.BICUBIC)
    #img.thumbnail((360,600), Image.BICUBIC)
    img.save(outpath)
    #img.save(outpath, quality=95)

if __name__ == '__main__':
    if len(sys.argv) != 3:
        # the input is a PDF; only the output is an image
        print("expect two parameters: INPUT.pdf OUTPUT.png", file=sys.stderr)
        sys.exit(-1)
    run(sys.argv[1], sys.argv[2])
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
new file mode 100755
index 0000000..5536e6c
--- /dev/null
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+]
+
+RELEASE_STAGE_MAP = {
+ 'draftVersion': 'draft',
+ 'submittedVersion': 'submitted',
+ 'acceptedVersion': 'accepted',
+ 'publishedVersion': 'published',
+ 'updatedVersion': 'updated',
+}
+
def canon(s):
    """Return URL string `s` after urlcanon's WHATWG canonicalization."""
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
+
def transform(obj):
    """
    Build ingest requests from one unpaywall record.

    Returns a list of request dicts, one per OA location that has a PDF URL
    which is not blocklisted and can be canonicalized. The list is empty when
    the DOI does not start with '10.' or there are no OA locations.
    """
    if not obj['doi'].startswith('10.') or not obj['oa_locations']:
        return []

    doi = obj['doi'].lower()
    requests = []
    for location in obj['oa_locations']:
        pdf_url = location['url_for_pdf']
        if not pdf_url:
            continue
        if any(domain in pdf_url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(pdf_url)
        except UnicodeEncodeError:
            continue

        ext_ids = {
            'doi': doi,
        }
        edit_extra = {}
        request = {
            'base_url': base_url,
            'ingest_type': 'pdf',
            'link_source': 'unpaywall',
            'link_source_id': doi,
            'ingest_request_source': 'unpaywall',
            'release_stage': RELEASE_STAGE_MAP.get(location['version']),
            'rel': location['host_type'],
            'ext_ids': ext_ids,
            'edit_extra': edit_extra,
        }
        # optional provenance details, copied only when present and non-empty
        if obj.get('oa_status'):
            edit_extra['oa_status'] = obj['oa_status']
        if location.get('evidence'):
            edit_extra['evidence'] = location['evidence']
        if location['pmh_id']:
            ext_ids['pmh_id'] = location['pmh_id']
        requests.append(request)

    return requests
+
def run(args):
    """
    For every non-blank line of args.json_file, parse it as JSON, pass it to
    transform(), and print each resulting request as JSON with sorted keys.
    """
    for raw_line in args.json_file:
        if raw_line.strip():
            record = json.loads(raw_line)
            # transform() may return nothing for records that are skipped
            for request in (transform(record) or []):
                print(json.dumps(request, sort_keys=True))
+
def main():
    """Parse the command line (a single input file path) and call run()."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="unpaywall dump file to use",
        type=argparse.FileType('r'))
    # no sub-commands: an empty add_subparsers() only added a stray "{}"
    # positional to the usage line

    args = parser.parse_args()

    run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/mapreduce/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml b/python/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
index dbc8be5..dbc8be5 100644
--- a/mapreduce/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
+++ b/python/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html
new file mode 100644
index 0000000..dbe3ef7
--- /dev/null
+++ b/python/tests/files/dlib_05vanhyning.html
@@ -0,0 +1,350 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="http://schema.org/Article">
+<head>
+<script type="text/javascript" src="/js/ga.js"></script>
+<style type="text/css">
+
+.topLeft { border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftThick { border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftRight {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftRightThick {border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftBottom {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.all {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+table.plain {border-collapse: separate;
+ border-spacing: 0px;
+ margin-left: auto;
+ margin-right: auto;
+ }
+td.plain {padding: 6px;
+ vertical-align: text-top;
+ }
+
+table.author {border-collapse: separate;
+ border-spacing: 6px;
+ }
+td.authors {padding: 6px;
+ }
+
+li:not(:last-child) {
+ margin-bottom: .5em;
+ }
+
+div.center {margin-left: auto; margin-right: auto;
+ }
+
+</style>
+<meta charset="utf-8" />
+<meta id="DOI" content="10.1045/may2017-vanhyning" />
+<meta itemprop="datePublished" content="2017-05-15" />
+<meta id="description" content="D-Lib Magazine Article" />
+<meta id="keywords" content="Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS" />
+<link href="../../../style/style1.css" rel="stylesheet" type="text/css" />
+
+<title>Transforming Libraries and Archives through Crowdsourcing</title>
+</head>
+
+<body>
+<form action="/cgi-bin/search.cgi" method="get">
+
+<div style="height:2px;background:#2b538e"></div>
+<div style="height:4px;background:#4078b1"></div>
+
+<div style="height:30px;background:#4078b1">
+
+<span style="color: #ffffff; font-size: 12px; float: right; margin-right: 10px;">Search D-Lib:
+<input type="text" id="words" value="" size="25" />
+<input type="submit" id="search" value="Go!" />
+<input type="hidden" id="config" value="htdig" />
+<input type="hidden" id="restrict" value="" />
+<input type="hidden" id="exclude" value="" />
+</span>
+</div>
+
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:1px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:1px;background:#2b538e"></div>
+<div style="height:92px;background:#4078b1"><img width="450" height="90" alt="D-Lib-blocks5" src="../../../img2/D-Lib-blocks5.gif">
+</div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#e04c1e"></div>
+<div style="height:24px;background:#eda443"><img src="../../../img2/magazine5.gif" alt="The Magazine of Digital Library Research" width="830" height="24" /></div>
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:28px;background:#2b538e">
+<div id="navtable">
+<table>
+<tr><td class="navtext"><img src="../../../img2/transparent.gif" alt="" width="20" height="20" /><a href="../../../dlib.html">HOME</a>&nbsp;|&nbsp;<a href="../../../about.html">ABOUT D-LIB</a>&nbsp;|&nbsp;<a href="../../../contents.html" class="navtext">CURRENT ISSUE</a>&nbsp;|&nbsp;<a href="../../../back.html">ARCHIVE</a>&nbsp;|&nbsp;<a href="../../../author-index.html">INDEXES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/groups.html">CALENDAR</a>&nbsp;|&nbsp;<a href="../../author-guidelines.html">AUTHOR GUIDELINES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/mailman/listinfo/dlib-subscribers">SUBSCRIBE</a>&nbsp;|&nbsp;<a href="../../letters.html">CONTACT D-LIB</a></td></tr></table></div></div>
+<div style="height:4px;background:#2b538e"></div>
+<div style="height:1px;background:#e04c1e"></div>
+
+<div style="padding-left: 2.5em; padding-top: 1em;">
+
+<h3 class="blue-space">D-Lib Magazine</h3>
+<p class="blue">May/June 2017<br />
+Volume 23, Number 5/6<br />
+<a href="../05contents.html">Table of Contents</a>
+</p>
+
+<div class="divider-full">&nbsp;</div>
+
+<h3 class="blue-space">Transforming Libraries and Archives through Crowdsourcing</h3>
+
+<p class="blue">Victoria Van Hyning, University of Oxford, Zooniverse<br />
+victoria [at] zooniverse.org<br /><br />
+
+Samantha Blickhan, The Adler Planetarium, Zooniverse<br />
+samantha [at] zooniverse.org<br /><br />
+
+Laura Trouille, The Adler Planetarium, Zooniverse<br />
+trouille [at] zooniverse.org<br /><br />
+
+Chris Lintott, University of Oxford, Zooniverse<br />
+chris [at] zooniverse.org</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p><a href="https://doi.org/10.1045/may2017-vanhyning" class="nolinka">https://doi.org/10.1045/may2017-vanhyning</a></p>
+
+<div class="divider-full">&nbsp;</div>
+ <!-- Abstract or TOC goes here -->
+
+<h3 class="blue">Abstract</h3>
+
+<p class="blue">This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.</p>
+
+<p class="blue">Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS</p>
+
+<!-- Article goes next -->
+
+<div class="divider-full">&nbsp;</div>
+<h3>1 Overview<span style="vertical-align: super;"><a href="#n6">1</a></span></h3>
+
+<p>As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.</p>
+
+<p>The <a href="https://www.zooniverse.org/">Zooniverse</a> is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.<span style="vertical-align: super;"><a href="#n1">2</a></span> The Zooniverse also hosts the <a href="https://www.zooniverse.org/lab">Project Builder</a>, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see <a href="https://www.zooniverse.org/about/team">Zooniverse Team</a> for a more complete list).</p>
+
+<p>In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.1 What Can Crowdsourcing Offer GLAMs?</h4>
+
+<p>In 2010, author and professor Clay Shirky delivered a rousing <a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">TED</a> talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[<a href="#1">1</a>] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[<a href="#2">2</a>] </p>
+
+<p>Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.</p>
+
+<p>GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offer an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.<span style="vertical-align: super;"><a href="#n2">3</a></span></p>
+
+<p>While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.2 How Does The Zooniverse Work?</h4>
+
+<p>All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The amount of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like <a href="https://www.shakespearesworld.org/#!/">Shakespeare's World</a> and <a href="https://anno.tate.org.uk/#!/">AnnoTate</a>, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder.
+
+<p>In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is <a href="https://talk.sciencegossip.org/#/search?tags%5Bfemale%5D=true">#female</a> from the <a href="https://www.sciencegossip.org/">Science Gossip</a> project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the <a href="https://talk.sciencegossip.org/#/boards/BSC0000004/discussions/DSC00004s8">Science Gossip Talk</a> board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>2 IMLS Funded Effort: Approach and Focus</h3>
+
+<p>Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was <a href="https://www.ancientlives.org/">Ancient Lives</a>, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[<a href="#6">3</a>] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[<a href="#7">4</a>]</p>
+
+<p>By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see <a href="http://zooniverse.org/get-involved/call-for-projects">Call for Projects</a>).</p>
+
+<p>From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, <a href="https://www.operationwardiary.org/">Operation War Diary</a> which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[<a href="#8">5</a>] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. 
For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.</p>
+
+<p>Over the course of the IMLS project, we will also address the following research questions:</p>
+
+<p class="indentLeft">Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?</p>
+
+<p class="indentLeft">Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?</p>
+
+<p class="indentLeft">Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.1 Addressing Q1 (Crowdsourcing for GLAM)</h4>
+
+<p>Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including <a href="http://blogs.ucl.ac.uk/transcribe-bentham/">Transcribe Bentham</a>, <a href="http://fromthepage.com/">FromthePage</a>, and <a href="http://scripto.org/">Scripto</a>. </p>
+
+<p>In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include: </p>
+
+<ul>
+ <li><i>Volunteer choice:</i> volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings. </li>
+
+ <li><i>Keeping the task simple:</i> in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.<span style="vertical-align: super;"><a href="#n3">4</a></span> Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.</li>
+
+ <li><i>Narrowing the content focus to support sense-making:</i> In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.</li>
+</ul>
+
+<p>Features optimizing best practice in regard to data production and management will include:</p>
+
+<ul>
+ <li><i>Reliable, Scalable, Open Source Code Infrastructure:</i> The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects &mdash; images, video or audio &mdash; for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through <a href="https://github.com/zooniverse">Github</a>.</li>
+
+ <li><i>Data Ingestion into Zooniverse:</i> In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.</li>
+
+ <li><i>Useful Output for Curation:</i> The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[<a href="#9">6</a>] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.</li>
+</ul>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)</h4>
+
+<p>Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document. </p>
+
+<p>In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.</p>
+
+<p>There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.3 Addressing Research Q3 (Feedback/Training)</h4>
+
+<p>We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018&ndash;2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>3 Future Steps: Community Engagement, Output &amp; How to Get Involved</h3>
+
+<p>The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.</p>
+
+<p>We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse <a href="https://github.com/zooniverse">code</a> is freely available under a liberal open source license which serves as an additional or parallel form of publication.</p>
+
+<p>GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on <a href="https://www.zooniverse.org/lab-how-to">how to build a project</a> and <a href="https://www.zooniverse.org/lab-best-practices/great-project">best practices for the design, launch and long term phases of a project</a>. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[<a href="#10">7</a>] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.<span style="vertical-align: super;"><a href="#n5">5</a></span> Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.</p>
+
+<p>In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions. </p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>Notes</h3>
+
+<table style="width:90%">
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n6">1</a></td>
+<td style="padding-top: .5em;">Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n1">2</a></td>
+<td style="padding-top: .5em;">For a partial list of publications, please visit <a href="https://www.zooniverse.org/about/publications">https://www.zooniverse.org/about/publications</a>. </td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n2">3</a></td>
+<td style="padding-top: .5em;">Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in <i>A New Companion to Digital Humanities</i>, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley &amp; Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also <i>Crowdsourcing Our Cultural Heritage</i>, ed. Mia Ridge (Ashgate, 2014).</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n3">4</a></td>
+<td style="padding-top: .5em;">Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "<a href="http://www.digitalhumanities.org/dhq/vol/6/2/000125/000125.html">Building a Volunteer Community: Results and Findings from Transcribe Bentham</a>", <i>Digital Humanities Quarterly</i> 6.2 (2012).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n5">5</a></td>
+<td style="padding-top: .5em;">Or, as Zephyr Frank, <i>et al</i>. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "<a href="https://github.com/cestastanford/crowdsourcing/raw/master/files/Mellon%20White%20Paper.pdf">Crowdsourcing for Humanities Research</a>" (2016) Project White Paper. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>References</h3>
+
+<table style="width:90%">
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="1">[1]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Clay Shirky, "<a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">How Cognitive Surplus Will Change the World</a>", June 2010.</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="2">[2]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Tim Berners-Lee with Mark Fischetti, <i>Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor</i> (San Francisco: Harper, 1999).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="6">[3]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in <i>The Oxyrhynchus Papyri</i>, R.-L. Chang <i>et al</i>., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98. </td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="7">[4]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Alex C. Williams <i>et al.</i>, "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in <i>IEEE International Conference on Big Data</i>, October 2014. <a href="https://doi.org/10.1109/BigData.2014.7004460">https://doi.org/10.1109/BigData.2014.7004460</a></td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="8">[5]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", <i>British Journal for Military History,</i> 2.2 (2016), 160-85.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="9">[6]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Katie Mika, "<a href="http://library.mcz.harvard.edu/blog/transcription-tools-survey-katie-mika-ndsr-resident">Transcription Tools: a survey by Katie Mika, NDSR Resident</a>", Harvard University, Ernst Mayr Library Blog.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="10">[7]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Roberta Kwok, "<a href="http://www.newyorker.com/tech/elements/crowdsourcing-for-shakespeare">Crowdsourcing For Shakespeare</a>", <i>The New Yorker</i>, 16 Jan. 2017. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>About the Authors</h3>
+
+<p class="blue"><b>Victoria Van Hyning</b> is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world-leading academic crowdsourcing organization. Her projects include <a href="https://www.sciencegossip.org">Science Gossip</a>, <a href="http://www.shakespearesworld.org">Shakespeare's World</a> and <a href="https://anno.tate.org.uk">AnnoTate</a>.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Samantha Blickhan</b> is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Laura Trouille</b> is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers. </p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue">Chris Lintott is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principal investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.</p>
+
+<div class="divider-full">&nbsp;</div>
+
+ <!-- Standard Copyright line here -->
+
+<div class="center">
+<p class="footer">Copyright &copy; 2017 Victoria Van Hyning, Samantha Blickhan, Laura Trouille and Chris Lintott</p>
+</div>
+
+<div style="height:1px;background:#2b538e"></div>
+
+</div>
+</form>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/dummy.pdf b/python/tests/files/dummy.pdf
new file mode 100644
index 0000000..774c2ea
--- /dev/null
+++ b/python/tests/files/dummy.pdf
Binary files differ
diff --git a/python/tests/files/dummy_zip.zip b/python/tests/files/dummy_zip.zip
new file mode 100644
index 0000000..cb72dc8
--- /dev/null
+++ b/python/tests/files/dummy_zip.zip
Binary files differ
diff --git a/python/tests/files/elife_article.html b/python/tests/files/elife_article.html
new file mode 100644
index 0000000..7aa1361
--- /dev/null
+++ b/python/tests/files/elife_article.html
@@ -0,0 +1,3094 @@
+<!doctype html>
+
+<html lang="en" prefix="og: http://ogp.me/ns#">
+
+<head>
+
+ <meta charset="utf-8">
+
+ <title>Parallel visual circuitry in a basal chordate | eLife</title>
+
+ <meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no">
+
+ <meta name="format-detection" content="telephone=no">
+
+
+ <link rel="apple-touch-icon" sizes="57x57" href="/assets/favicons/apple-touch-icon-57x57.4aeffd56.png">
+ <link rel="apple-touch-icon" sizes="60x60" href="/assets/favicons/apple-touch-icon-60x60.91474092.png">
+ <link rel="apple-touch-icon" sizes="72x72" href="/assets/favicons/apple-touch-icon-72x72.95fa9e7b.png">
+ <link rel="apple-touch-icon" sizes="76x76" href="/assets/favicons/apple-touch-icon-76x76.a4c54393.png">
+ <link rel="apple-touch-icon" sizes="114x114" href="/assets/favicons/apple-touch-icon-114x114.a8199d6e.png">
+ <link rel="apple-touch-icon" sizes="120x120" href="/assets/favicons/apple-touch-icon-120x120.efde6c5c.png">
+ <link rel="apple-touch-icon" sizes="144x144" href="/assets/favicons/apple-touch-icon-144x144.457f5c5e.png">
+ <link rel="apple-touch-icon" sizes="152x152" href="/assets/favicons/apple-touch-icon-152x152.5aea1932.png">
+ <link rel="apple-touch-icon" sizes="180x180" href="/assets/favicons/apple-touch-icon-180x180.21337439.png">
+ <link rel="icon" type="image/svg+xml" href="/assets/favicons/favicon.ee498e7d.svg">
+ <link rel="icon" type="image/png" sizes="32x32" href="/assets/favicons/favicon-32x32.825ee0ea.png">
+ <link rel="icon" type="image/png" sizes="192x192" href="/assets/favicons/android-chrome-192x192.365fe68b.png">
+ <link rel="icon" type="image/png" sizes="16x16" href="/assets/favicons/favicon-16x16.337f389b.png">
+ <link rel="shortcut icon" href="/assets/favicons/favicon.a755add0.ico">
+ <link rel="manifest" href="/assets/favicons/manifest.cff74b51.json">
+ <meta name="theme-color" content="#ffffff">
+ <meta name="application-name" content="eLife">
+
+
+
+
+
+
+ <meta name="dc.format" content="text/html">
+ <meta name="dc.language" content="en">
+ <meta name="dc.publisher" content="eLife Sciences Publications Limited">
+
+ <meta name="dc.title" content="Parallel visual circuitry in a basal chordate">
+
+ <meta name="dc.identifier" content="doi:10.7554/eLife.44753">
+
+ <meta name="dc.date" content="2019-04-18">
+
+ <meta name="dc.rights" content="© 2019 Kourakis et al.. This article is distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use and redistribution provided that the original author and source are credited.">
+
+
+
+ <meta name="dc.contributor" content="Matthew J Kourakis">
+ <meta name="dc.contributor" content="Cezar Borba">
+ <meta name="dc.contributor" content="Angela Zhang">
+ <meta name="dc.contributor" content="Erin Newman-Smith">
+ <meta name="dc.contributor" content="Priscilla Salas">
+ <meta name="dc.contributor" content="B Manjunath">
+ <meta name="dc.contributor" content="William C Smith">
+
+
+
+ <meta property="og:site_name" content="eLife">
+ <meta property="og:url" content="https://elifesciences.org/articles/44753">
+ <meta property="og:title" content="Parallel visual circuitry in a basal chordate">
+ <meta name="twitter:site" content="@eLife">
+
+ <meta property="og:description" content="The ascidian Ciona integrates visual information from two photoreceptor types through convergent excitatory and disinhibitory circuits, thereby evoking swim behaviors.">
+ <meta name="description" content="The ascidian Ciona integrates visual information from two photoreceptor types through convergent excitatory and disinhibitory circuits, thereby evoking swim behaviors.">
+
+ <meta name="twitter:card" content="summary">
+
+ <meta property="og:type" content="article">
+
+ <link rel="canonical" href="/articles/44753">
+
+
+
+
+
+
+
+
+
+ <!--[if lt IE 9]>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js"></script>
+ <![endif]-->
+
+ <script>
+ window.gtmDataLayer = window.gtmDataLayer || [];
+
+ window.gtmDataLayer.push(
+ {
+ 'articleSubjects': 'Neuroscience',
+ 'articleType': 'Research Article',
+ 'articlePublishDate': 'Apr 18, 2019'
+ }
+ );
+
+ (function (w, d, s, l, i) {
+ w[l] = w[l] || [];
+ w[l].push({
+ 'gtm.start': new Date().getTime(), event: 'gtm.js'
+ });
+ var f = d.getElementsByTagName(s)[0],
+ j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
+ j.async = true;
+ j.src =
+ 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
+ f.parentNode.insertBefore(j, f);
+ })(window, document, 'script', 'gtmDataLayer', 'GTM-WVM8KG');
+ </script>
+
+
+</head>
+
+<body>
+
+ <noscript>
+ <iframe src="https://www.googletagmanager.com/ns.html?id=GTM-WVM8KG" height="0" width="0"
+ style="display:none; visibility:hidden"></iframe>
+ </noscript>
+
+ <div class="global-wrapper" data-behaviour=" CookieOverlay FragmentHandler Math HypothesisLoader"
+ data-item-type="research-article"
+ >
+
+ <div class="global-inner">
+
+ <div class="wrapper wrapper--site-header">
+ <header class="site-header clearfix" data-behaviour="SiteHeader" id="siteHeader">
+ <div class="site-header__title clearfix" role="banner">
+ <div class="site-header__skip_to_content">
+ <a href="#maincontent" class="site-header__skip_to_content__link button button--default">Skip to Content</a>
+ </div>
+ <a href="/" class="site-header__logo_link">
+ <picture class="site-header__logo_link_image">
+ <source srcset="/assets/patterns/img/patterns/organisms/elife-logo-full.b1283c9a.svg" type="image/svg+xml" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/patterns/organisms/elife-logo-symbol.6f18db13.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/patterns/organisms/elife-logo-full-1x.ce3f6342.png" alt="eLife logo" class="site-header__logo_link"/>
+ </picture>
+ <span class="visuallyhidden" >eLife home page</span>
+ </a>
+ </div>
+ <div class="site-header__navigation" role="navigation" aria-label="Main navigation">
+
+ <nav class="nav-secondary">
+ <ul class="nav-secondary__list clearfix">
+ <li class="nav-secondary__item nav-secondary__item--first">
+
+
+
+
+ <a href="/about">
+
+ About
+ </a>
+
+
+ </li>
+ <li class="nav-secondary__item">
+
+
+
+
+ <a href="/community">
+
+ Community
+ </a>
+
+
+ </li>
+ <li class="nav-secondary__item nav-secondary__item--hide-narrow">
+
+
+
+ <a href="https://reviewer.elifesciences.org/login" class="button button--extra-small button--default" id="submitResearchButton">Submit my research</a>
+
+
+
+ </li>
+ <li class="nav-secondary__item nav-secondary__item--last">
+
+
+ <div class="login-control"
+
+
+ data-behaviour="LoginControl">
+
+
+ <a href="/log-in" class="button button--login" >Log in/Register<span class="visuallyhidden"> (via ORCID - An ORCID is a persistent digital identifier for researchers)</span></a>
+
+ </div>
+
+
+ </li>
+ </ul>
+ </nav>
+
+ <nav class="nav-primary">
+ <ul class="nav-primary__list clearfix">
+ <li class="nav-primary__item nav-primary__item--first">
+
+
+
+
+ <a href="#mainMenu">
+ <picture class="nav-primary__menu_icon">
+
+
+
+ <source srcset="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic.ac4e582f.svg"
+ type="image/svg+xml"
+ >
+
+
+
+ <img srcset="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic_2x.8722f6c7.png 2x, /assets/patterns/img/patterns/molecules/nav-primary-menu-ic_1x.8efd68cc.png 1x"
+ src="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic_1x.8efd68cc.png"
+
+ alt="" />
+
+
+ </picture>
+
+
+ <span class="visuallyhidden nav-primary__menu_text"> Menu </span>
+
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/">
+
+ Home
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/magazine">
+
+ Magazine
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/labs">
+
+ Innovation
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item nav-primary__item--last nav-primary__item--search">
+
+
+
+
+ <a href="/search" rel="search">
+ <picture class="nav-primary__search_icon">
+
+
+
+ <source srcset="/assets/patterns/img/patterns/molecules/nav-primary-search-ic.350bcf38.svg"
+ type="image/svg+xml"
+ >
+
+
+
+ <img srcset="/assets/patterns/img/patterns/molecules/nav-primary-search-ic_2x.0635c16f.png 2x, /assets/patterns/img/patterns/molecules/nav-primary-search-ic_1x.8e357583.png 1x"
+ src="/assets/patterns/img/patterns/molecules/nav-primary-search-ic_1x.8e357583.png"
+
+ alt="" />
+
+
+ </picture>
+
+
+ <span class="visuallyhidden nav-primary__menu_text"> Search the eLife site </span>
+
+ </a>
+
+
+ </li>
+ </ul>
+ </nav>
+
+ </div>
+
+
+ <div class="search-box" data-behaviour="SearchBox">
+ <div class="search-box__inner">
+ <form class="compact-form" id="search" action="/search" method="GET" novalidate>
+ <fieldset class="compact-form__container">
+ <label>
+ <span class="visuallyhidden">Search by keyword or author</span>
+ <input type="search" name="for" value="" placeholder="Search by keyword or author"
+
+ class="compact-form__input"
+
+ >
+ </label>
+
+
+ <button type="reset" name="reset" class="compact-form__reset"><span class="visuallyhidden">Reset form</span></button>
+ <button type="submit" class="compact-form__submit"><span class="visuallyhidden">Search</span></button>
+ </fieldset>
+ </form>
+
+ <label class="search-box__search_option_label">
+ <input type="checkbox" name="subjects[]" value="neuroscience" form="search">Limit my search to Neuroscience
+ </label>
+
+ </div>
+ </div>
+
+</header>
+
+ </div>
+
+
+
+ <main role="main" class="main" id="maincontent">
+
+
+ <header
+ class="content-header wrapper content-header--header content-header--has-social-media-sharers clearfix"
+ data-behaviour="ContentHeader">
+
+
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <a href="/subjects/neuroscience" class="content-header__subject_link">
+ <span class="content-header__subject">Neuroscience</span>
+ </a>
+ </li>
+ </ol>
+
+ <ul class="content-header__icons">
+ <li><a href="https://en.wikipedia.org/wiki/Open_access"
+ class="content-header__icon content-header__icon--oa"><span
+ class="visuallyhidden">Open access</span></a></li>
+ <li><a href="https://creativecommons.org/licenses/by/4.0/"
+ class="content-header__icon content-header__icon--cc"><span
+ class="visuallyhidden">Copyright information</span></a></li>
+ </ul>
+
+ <a href="#downloads" class="content-header__download_link">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/download-full.6691999e.svg" type="image/svg+xml" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/icons/download-full-2x.a54fbeb0.png" type="image/png" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/icons/download.ecfa2d98.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/download-full-1x.5485093b.png" class="content-header__download_icon" alt="Download icon">
+ </picture>
+ </a>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--short">Parallel visual circuitry in a basal chordate</h1>
+
+
+ <div class="social-media-sharers">
+
+
+ <a class="social-media-sharer" href="https://facebook.com/sharer/sharer.php?u=https%3A%2F%2Fdoi.org%2F10.7554%2FeLife.44753" target="_blank" rel="noopener noreferrer" aria-label="Share on Facebook">
+ <div class="social-media-sharer__icon_wrapper social-media-sharer__icon_wrapper--facebook social-media-sharer__icon_wrapper--small"><div aria-hidden="true" class="social-media-sharer__icon social-media-sharer__icon--solid">
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M18.77 7.46H14.5v-1.9c0-.9.6-1.1 1-1.1h3V.5h-4.33C10.24.5 9.5 3.44 9.5 5.32v2.15h-3v4h3v12h5v-12h3.85l.42-4z"/></svg>
+ </div>
+ </div>
+ </a>
+
+ <a class="social-media-sharer" href="https://twitter.com/intent/tweet/?text=Parallel%20visual%20circuitry%20in%20a%20basal%20chordate&amp;url=https%3A%2F%2Fdoi.org%2F10.7554%2FeLife.44753" target="_blank" rel="noopener noreferrer" aria-label="Tweet a link to this page">
+ <div class="social-media-sharer__icon_wrapper social-media-sharer__icon_wrapper--twitter social-media-sharer__icon_wrapper--small"><div aria-hidden="true" class="social-media-sharer__icon social-media-sharer__icon--solid">
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M23.44 4.83c-.8.37-1.5.38-2.22.02.93-.56.98-.96 1.32-2.02-.88.52-1.86.9-2.9 1.1-.82-.88-2-1.43-3.3-1.43-2.5 0-4.55 2.04-4.55 4.54 0 .36.03.7.1 1.04-3.77-.2-7.12-2-9.36-4.75-.4.67-.6 1.45-.6 2.3 0 1.56.8 2.95 2 3.77-.74-.03-1.44-.23-2.05-.57v.06c0 2.2 1.56 4.03 3.64 4.44-.67.2-1.37.2-2.06.08.58 1.8 2.26 3.12 4.25 3.16C5.78 18.1 3.37 18.74 1 18.46c2 1.3 4.4 2.04 6.97 2.04 8.35 0 12.92-6.92 12.92-12.93 0-.2 0-.4-.02-.6.9-.63 1.96-1.22 2.56-2.14z"/></svg>
+ </div>
+ </div>
+ </a>
+
+ <a class="social-media-sharer" href="mailto:?subject=Parallel%20visual%20circuitry%20in%20a%20basal%20chordate&amp;body=https%3A%2F%2Fdoi.org%2F10.7554%2FeLife.44753" target="_self" aria-label="Email a link to this page (opens up email program, if configured on this system)">
+ <div class="social-media-sharer__icon_wrapper social-media-sharer__icon_wrapper--email social-media-sharer__icon_wrapper--small"><div aria-hidden="true" class="social-media-sharer__icon social-media-sharer__icon--solid">
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M22 4H2C.9 4 0 4.9 0 6v12c0 1.1.9 2 2 2h20c1.1 0 2-.9 2-2V6c0-1.1-.9-2-2-2zM7.25 14.43l-3.5 2c-.08.05-.17.07-.25.07-.17 0-.34-.1-.43-.25-.14-.24-.06-.55.18-.68l3.5-2c.24-.14.55-.06.68.18.14.24.06.55-.18.68zm4.75.07c-.1 0-.2-.03-.27-.08l-8.5-5.5c-.23-.15-.3-.46-.15-.7.15-.22.46-.3.7-.14L12 13.4l8.23-5.32c.23-.15.54-.08.7.15.14.23.07.54-.16.7l-8.5 5.5c-.08.04-.17.07-.27.07zm8.93 1.75c-.1.16-.26.25-.43.25-.08 0-.17-.02-.25-.07l-3.5-2c-.24-.13-.32-.44-.18-.68s.44-.32.68-.18l3.5 2c.24.13.32.44.18.68z"/></svg>
+ </div>
+ </div>
+ </a>
+
+ <a class="social-media-sharer" href="https://reddit.com/submit/?title=Parallel%20visual%20circuitry%20in%20a%20basal%20chordate&amp;url=https%3A%2F%2Fdoi.org%2F10.7554%2FeLife.44753" target="_blank" rel="noopener noreferrer" aria-label="Share this page on Reddit">
+ <div class="social-media-sharer__icon_wrapper social-media-sharer__icon_wrapper--reddit social-media-sharer__icon_wrapper--small"><div aria-hidden="true" class="social-media-sharer__icon social-media-sharer__icon--solid">
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M24 11.5c0-1.65-1.35-3-3-3-.96 0-1.86.48-2.42 1.24-1.64-1-3.75-1.64-6.07-1.72.08-1.1.4-3.05 1.52-3.7.72-.4 1.73-.24 3 .5C17.2 6.3 18.46 7.5 20 7.5c1.65 0 3-1.35 3-3s-1.35-3-3-3c-1.38 0-2.54.94-2.88 2.22-1.43-.72-2.64-.8-3.6-.25-1.64.94-1.95 3.47-2 4.55-2.33.08-4.45.7-6.1 1.72C4.86 8.98 3.96 8.5 3 8.5c-1.65 0-3 1.35-3 3 0 1.32.84 2.44 2.05 2.84-.03.22-.05.44-.05.66 0 3.86 4.5 7 10 7s10-3.14 10-7c0-.22-.02-.44-.05-.66 1.2-.4 2.05-1.54 2.05-2.84zM2.3 13.37C1.5 13.07 1 12.35 1 11.5c0-1.1.9-2 2-2 .64 0 1.22.32 1.6.82-1.1.85-1.92 1.9-2.3 3.05zm3.7.13c0-1.1.9-2 2-2s2 .9 2 2-.9 2-2 2-2-.9-2-2zm9.8 4.8c-1.08.63-2.42.96-3.8.96-1.4 0-2.74-.34-3.8-.95-.24-.13-.32-.44-.2-.68.15-.24.46-.32.7-.18 1.83 1.06 4.76 1.06 6.6 0 .23-.13.53-.05.67.2.14.23.06.54-.18.67zm.2-2.8c-1.1 0-2-.9-2-2s.9-2 2-2 2 .9 2 2-.9 2-2 2zm5.7-2.13c-.38-1.16-1.2-2.2-2.3-3.05.38-.5.97-.82 1.6-.82 1.1 0 2 .9 2 2 0 .84-.53 1.57-1.3 1.87z"/></svg>
+ </div>
+ </div>
+ </a>
+
+ </div>
+
+ </div>
+
+ <div class="content-header__authors">
+ <ol class="content-header__author_list" aria-label="Authors of this article">
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x8d8d9914" data-behaviour="Popup" class="content-header__author_link">Matthew J Kourakis</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xf3e51472" data-behaviour="Popup" class="content-header__author_link">Cezar Borba</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xb536f34f" data-behaviour="Popup" class="content-header__author_link">Angela Zhang</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x1d85dfc3" data-behaviour="Popup" class="content-header__author_link">Erin Newman-Smith</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x6107dd5d" data-behaviour="Popup" class="content-header__author_link">Priscilla Salas</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x8b937bbf" data-behaviour="Popup" class="content-header__author_link">B Manjunath</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xa3814a31" data-behaviour="Popup" class="content-header__author_link">William C Smith</a><span class="content-header__author_suffix">&nbsp;<picture>
+ <source srcset="/assets/patterns/img/icons/corresponding-author.d7eda27b.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/corresponding-author@1x.89247d49.png"
+ srcset="/assets/patterns/img/icons/corresponding-author@2x.808ab270.png 2x, /assets/patterns/img/icons/corresponding-author@1x.89247d49.png 1x"
+ alt="Is a corresponding author" class="content-header__author_icon">
+ </picture><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ </ol>
+
+ <ol class="content-header__institution_list" aria-label="Author institutions">
+ <li class="content-header__institution_list_item">
+ <span class="content-header__institution">University of California, Santa Barbara, United States<span class="content-header__institution_separator" aria-hidden="true">;</span>
+ </span>
+ </li>
+ </ol>
+ </div>
+
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> <time datetime="2019-04-18">Apr 18, 2019</time></span>
+ </div>
+ </div>
+
+
+</header>
+
+
+
+
+
+
+ <div class="wrapper">
+
+ <div class="contextual-data">
+
+ <ul class="contextual-data__list" aria-label="The following contains the number of views, citations and annotations in this article">
+
+ <li class="contextual-data__item"><a href="/articles/44753#metrics">Cited 0</a></li>
+ <li class="contextual-data__item"><a href="/articles/44753#metrics">Views 807</a></li>
+
+ <li class="contextual-data__item" data-hypothesis-trigger><span class="contextual-data__item__hypothesis_opener">Annotations</span> <button class="speech-bubble speech-bubble--small "
+ data-behaviour="SpeechBubble HypothesisOpener"
+
+aria-live="polite">
+ <span class="speech-bubble__inner"><span aria-hidden="true"><span data-visible-annotation-count></span></span><span class="visuallyhidden"> Open annotations. The current annotation count on this page is <span data-hypothesis-annotation-count>being calculated</span>.</span></span>
+</button>
+</li>
+
+ </ul>
+
+ <div class="contextual-data__cite_wrapper">
+ <span class="contextual-data__cite"><span class="contextual-data__cite_label">Cite <span class="visuallyhidden"> this article</span> as:</span> eLife 2019;8:e44753</span>
+ <span class="doi">doi: <a href="https://doi.org/10.7554/eLife.44753" class="doi__link">10.7554/eLife.44753</a></span>
+ </div>
+
+</div>
+
+
+ </div>
+
+
+
+ <div data-behaviour="DelegateBehaviour" data-delegate-behaviour="Popup" data-selector=".article-section:not(#abstract) a">
+
+
+ <div class="wrapper wrapper--content">
+
+ <div class="grid">
+
+
+
+ <div class="grid__item one-whole x-large--two-twelfths">
+
+ <div class="view-selector view-selector--has-figures" data-behaviour="ViewSelector" data-side-by-side-link="https://lens.elifesciences.org/44753">
+ <ul class="view-selector__list">
+ <li class="view-selector__list-item view-selector__list-item--article view-selector__list-item--active">
+ <a href="/articles/44753" class="view-selector__link view-selector__link--article"><span>Article</span></a>
+ </li>
+ <li class="view-selector__list-item view-selector__list-item--figures">
+ <a href="/articles/44753/figures" class="view-selector__link view-selector__link--figures"><span>Figures and data</span></a>
+ </li>
+
+ <li class="view-selector__list-item view-selector__list-item--jump">
+ <span class="view-selector__jump_links_header">Jump to</span>
+ <ul class="view-selector__jump_links">
+ <li class="view-selector__jump_link_item">
+ <a href="#abstract" class="view-selector__jump_link">Abstract</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s1" class="view-selector__jump_link">Introduction</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s2" class="view-selector__jump_link">Results</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s3" class="view-selector__jump_link">Discussion</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s4" class="view-selector__jump_link">Materials and methods</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#references" class="view-selector__jump_link">References</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#SA1" class="view-selector__jump_link">Decision letter</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#SA2" class="view-selector__jump_link">Author response</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#info" class="view-selector__jump_link">Article and author information</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#metrics" class="view-selector__jump_link">Metrics</a>
+ </li>
+ </ul>
+ </li>
+
+ </ul>
+</div>
+
+ </div>
+
+
+ <div class="content-container grid__item one-whole
+
+ large--eight-twelfths x-large--seven-twelfths
+ grid-column">
+
+
+
+
+
+ <section
+ class="article-section article-section--first"
+ id="abstract"
+ data-behaviour="ArticleSection"
+
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Abstract</h2>
+ </header>
+
+ <div class="article-section__body">
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="s2"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Results</h2>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+ id="s2-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Glutamatergic and GABAergic photoreceptors</h3>
+ </header>
+
+ <div class="article-section__body">
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Posterior brain vesicle relay neurons are mixed VGAT- and VACHT-expressing</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Sensory input from the photoreceptors, antenna cells, coronet cells, bipolar tail neurons and a subset of peripheral neurons is directed to a cluster of ~30 RNs in the pBV. These RNs in turn extend axons through the neck to the MG. Among this cluster are the six prRNs and eight pr-AMG RNs (<a href="#fig1">Figure 1</a>; (<a href="#bib43">Ryan et al., 2016</a>)). Previous in situ hybridization studies identified VGAT- and VACHT-expressing neurons in the appropriate place in the BV to be RNs (<a href="#bib56">Yoshida et al., 2004</a>). Moreover, these neurons project axons posteriorly to the MG, a defining characteristic of the pBV RNs. BV neurons expressing other major NTs, including glutamate, dopamine, and serotonin, are neither in the correct brain region to be RNs, nor do they project from the BV to the MG ([<a href="#bib20">Horie et al., 2008b</a>; <a href="#bib31">Moret et al., 2005</a>; <a href="#bib39">Pennati et al., 2007</a>], and our observations). By HCR in situ we observed that the pBV RNs cluster in two distinct groups along the anterior/posterior axis, with the anterior cluster expressing VACHT, and the posterior group expressing VGAT (<a href="#fig3">Figure 3a</a>). We observed an average of 16 (±1.6, n = 9 larvae) VGAT-positive neurons and 11 (±1, n = 8 larvae) VACHT-positive neurons.</p>
+ <div
+ id="fig3"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig3"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg"
+ data-asset-viewer-width="1500"
+ data-asset-viewer-height="1109"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 3</span> with 1 supplement <a href="/articles/44753/figures#fig3" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzMtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig3-v2.jpg?_hash=poB6zI7Tss9wKOFGYwhd40WSG4X9%2B4%2FgYw9ffJwpELo%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Neurotransmitter use in the relay neurons.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) In situ hybridization of VGAT and VACHT to the relay neurons in the brain vesicle. Also visible is the anterior tip of the motor ganglion. Nuclei are shown as spheres. (<b>b</b>) Confusion matrix for relay neuron registration. (<b>c</b>) Confusion matrix for relay neurons grouped by type. (<b>d</b>) Heat map of neurotransmitter predictions from cell registration of relay neurons, with scale showing color by proportion of iterations predicting either VGAT or VACHT. Abbreviations: ant., anterior; post., posterior; dor., dorsal; vent., ventral; MG, motor ganglion; pr-AMG RN, photoreceptor ascending motor ganglion relay neuron; prRN, photoreceptor relay neuron; AntRN, antenna cell relay neuron; PBRN, photoreceptor-bipolar tail neuron relay neuron; PCRN, photoreceptor-coronet relay neuron; PNRN, peripheral relay neuron; VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.007" class="doi__link">https://doi.org/10.7554/eLife.44753.007</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">Unlike the ocellus, the pBV RN cluster does not have obvious anatomical features, although the various classes of RNs are clustered, with, for example, the antenna cell RNs (AntRN) being posterior to the photoreceptor RNs (<a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>; <a href="#bib43">Ryan et al., 2016</a>). However, given the diversity of RN types in the pBV it is unlikely that the expression domains of VGAT and VACHT precisely correspond to the clusters of RN classes. In order to make predictions of NT use in the RNs we used the same registration approach as with the photoreceptors (n = 7 VGAT/VACHT double in situ datasets, <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). The confusion matrix for the RNs shows a lower level of convergence than for the PR-Is, suggesting that the cellular anatomy of the RN cluster is less structured than the ocellus (<a href="#fig3">Figure 3b</a>; <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). However, the confusion matrix also shows that the RNs are most often confused for other RNs of the same class (white boxes in <a href="#fig3">Figure 3b</a>). This is most evident when the registration is performed not with single cells, but with pooled RNs of each class (<a href="#fig3">Figure 3c</a>), and is presumably a reflection of the clustering of RN classes in the pBV. Thus we can have higher confidence in the NT use by RN class than we can have in individual neuron identities. For example, the connectome shows the AntRNs are clustered at the rear of the BV (<a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>; (<a href="#bib43">Ryan et al., 2016</a>)), as are the VGAT expressing neurons (<a href="#fig3">Figure 3a</a>; <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). Accordingly, the registration predicts that eight of the ten AntRNs are VGAT positive (<a href="#fig3">Figure 3c</a>). 
For the present study, which focuses on the visuomotor pathway, the registration predicts that five of the eight pr-AMG RNs are VGAT expressing, two are VACHT expressing, and one (pr-AMG RN 157) cannot be resolved (no dual VGAT/VACHT expression was observed in the <i>in situs</i>). On the other hand, the registration predicts that the six prRNs are evenly mixed between VGAT and VACHT expression. These predictions provide starting points for experimental validation detailed below.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">The motor ganglion contains a mixture of cholinergic and GABAergic neurons</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">The MG contains five left/right pairs of motor neurons, as well as several classes of interneurons, including six MGINs, seven AMGs, two ddNs, and two posterior MG interneurons (<a href="#bib43">Ryan et al., 2016</a>). Also described in the MG are two left/right pairs of decussating VGAT-positive neurons (<a href="#bib21">Horie et al., 2009</a>; <a href="#bib35">Nishino et al., 2010</a>). These are likely the same decussating MG neurons as described in the connectome, although the names are slightly different (<i>anterior caudal inhibitory neurons</i> (<a href="#bib21">Horie et al., 2009</a>) versus <i>ascending contralateral inhibitory neurons</i> (<a href="#bib43">Ryan et al., 2016</a>), both abbreviated as ACIN). However, the connectome reports only three ACINs, with the anterior ACIN not paired. It was speculated that this was an anomalous feature of the particular larva used for the ssEM. Supporting this, a second larva being analyzed by ssEM for connectomics shows two pairs of ACINs (K. Ryan, personal communication).</p>
+<p class="paragraph">Like the ocellus, the MG has a well-defined anterior-to-posterior and dorsal-to-ventral cellular anatomy (<a href="#fig4">Figure 4a and b</a>; <a href="#bib43">Ryan et al., 2016</a>; <a href="#bib45">Ryan et al., 2018</a>). Neurotransmitter use by some MG neurons is already documented, including the motor neurons, which are cholinergic (<a href="#bib53">Takamura et al., 2010</a>; <a href="#bib52">Takamura et al., 2002</a>), and the ACINs which are glycinergic (<a href="#bib35">Nishino et al., 2010</a>). By HCR in situ hybridization we observed VGAT- and VACHT-positive neurons in the MG (<a href="#fig4">Figure 4b</a>), but no VGLUT- or TH-positive cells (data not shown). These results are consistent with previous studies (<a href="#bib20">Horie et al., 2008b</a>; <a href="#bib31">Moret et al., 2005</a>). Likewise it was reported that no serotonergic cells were present in the MG (<a href="#bib39">Pennati et al., 2007</a>). As with the RNs, the VGAT- and VACHT-expressing neurons in the MG are segregated anatomically. We also found a population of 6–7 cells between the AMGs and the MNs (asterisks in <a href="#fig4">Figure 4a</a>), that were not annotated in the connectome as neurons and that failed to label with any of our NT markers. We hypothesize that these are ependymal cells, which are abundant in the nerve cord immediately caudal to this region.</p>
+ <div
+ id="fig4"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig4"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg"
+ data-asset-viewer-width="1274"
+ data-asset-viewer-height="1500"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 4</span> with 1 supplement <a href="/articles/44753/figures#fig4" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzQtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig4-v2.jpg?_hash=ywXgEBLsOzGfI3rEs2OHLvcZSgqwkJ8EhBicEWmfAJ8%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Neurotransmitter use in the motor ganglion.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b> and <b>b</b>) Expression of VGAT and VACHT by in situ hybridization in the motor ganglion, lateral (<b>a</b>) and dorsal (<b>b</b>) views. Asterisks indicate predicted ependymal cells. (<b>c</b>) Lateral view of VGAT expression in the AMGs. (<b>d</b>) shows same view as c, but with VACHT expression. (<b>e</b>) Diagram of neurons in the motor ganglion (derived from Figure 1 of <a href="#bib44">Ryan et al., 2017</a>). Box indicates approximate positions of panels c and d. Lateral view; anterior is to the left. (<b>f</b>) Dorsal view of VGAT expression in the AMGs. Asterisk indicates central non-VGAT expressing cell. (<b>g</b>) Three dimensional surface rendering of VGAT expressing cells in the AMGs. (<b>h</b>) Diagram of a dorsal view of the motor ganglion. AMG cells are numbered. Abbreviations: dor., dorsal; vent., ventral; ant., anterior; post., posterior; AMG, ascending motor ganglion neuron; MGIN, motor ganglion interneuron; ddN, descending decussating neurons; ACIN, ascending contralateral inhibitory neurons; MN, motor neuron; VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.009" class="doi__link">https://doi.org/10.7554/eLife.44753.009</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">Because of the highly structured MG cellular anatomy, we can identify the various MG cell types in the in situ data. The anterior group of VGAT-positive cells is clustered dorsally in the MG, and corresponds to AMGs (Figure 4c, d and e; (<a href="#bib44">Ryan et al., 2017</a>)). In a dorsal view of the MG (<a href="#fig4">Figure 4f,g and h</a>) a ring of VGAT-positive cells was observed with a non-VGAT expressing cell in the center (asterisk, <a href="#fig4">Figure 4f and g</a>). The VGAT-expressing cells appear to be AMGs 1, 2, 3, 4, 6, and 7, while the central cell, which is instead positive for VACHT, appears to be AMG5. The connectome shows that AMG5 differs in its connectivity from the other AMGs. Significantly, AMG5 is the principal synaptic input for PNS neurons. It then synapses to the other AMGs, which in turn project their axons to other cells in the MG, including MGINs and MNs, as well as to the pr-AMG RNs in the BV. In the posterior of the MG we observed two pairs of VGAT-positive neurons, as described previously (<a href="#bib21">Horie et al., 2009</a>). Finally, in the ventral MG we observed a continuous block of VACHT expression that encompasses the anterior three pairs of MNs, the ddNs, and the MGINs. Similar in situ patterns were observed in most larvae (<a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), although the positions of the ACINs were offset in several (see larvae 5 and 6 in <a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), and one larva was observed to be missing both one motor neuron and one ACIN (larva 7 in <a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), suggesting that MG variants, such as was observed in the animal used in the connectome study, may be relatively common.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Parallel visuomotor circuits</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Our results indicate that the PR-Is, with the exception of two cells, are glutamatergic, while the PR-IIs are a mixture of GABAergic and GABA/glutamatergic. The <i>Ciona</i> genome contains a single glutamate AMPA receptor (AMPAR) (<a href="#bib36">Okamura et al., 2005</a>) that is expressed in larvae in the two antenna cells, and in a small cluster of neurons in the pBV (<a href="#bib18">Hirai et al., 2017</a>). Published results show that most of the pBV group of AMPAR-positive neurons are clustered at the ends of Arrestin-labeled photoreceptor axons, and that they extend their axons to the MG, suggesting they are photoreceptor RNs (see Figure 2B" in <a href="#bib18">Hirai et al., 2017</a>). We find that this pBV group is composed of ~6 cells (<a href="/articles/44753/figures#fig5s1">Figure 5—figure supplement 1</a>). To investigate this further, we co-expressed a pAMPAR &gt;GFP construct (<a href="#bib18">Hirai et al., 2017</a>) with pVACHT &gt;CFP and pVGAT &gt;nuclear RFP constructs. We observed coexpression of the AMPAR reporter in a subset of the VACHT-positive RNs, but never in the VGAT-expressing RNs (<a href="#fig5">Figure 5a</a>).</p>
+ <div
+ id="fig5"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig5"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg"
+ data-asset-viewer-width="1400"
+ data-asset-viewer-height="1500"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 5</span> with 1 supplement <a href="/articles/44753/figures#fig5" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzUtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig5-v2.jpg?_hash=0aDXVHgnozrp0Q80t8%2FT5K718EzJQmgreXzfYFK9oAQ%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">AMPA receptors in negative phototaxis.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) Coexpression of an AMPA-receptor and VACHT expression constructs in the relay neurons (white asterisks). The main panel shows the merge while smaller panels at right show single channels. (<b>b</b>) Negative phototaxis assay in control larvae. Yellow arrow indicates direction of 505 nm light. By 60 min (m) the majority of the larvae have swum to the side of the dish away from the light (red arrow). (<b>c</b>) Perampanel-treated larvae do not show negative phototaxis. (<b>d</b>) Quantification of negative phototaxis in control and perampanel-treated larvae. Points indicate the averages from three independent assays, ±standard deviation. Y-axis represents the percentage of larvae found on the side away from the light source (distal third). Abbreviations: VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.011" class="doi__link">https://doi.org/10.7554/eLife.44753.011</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">To assess the function of the AMPAR-positive cells in <i>Ciona</i> visuomotor behaviors we used the non-competitive AMPAR antagonist perampanel (<a href="#bib17">Hanada et al., 2011</a>). For the assay, larvae were treated at 25 hr post fertilization (hpf) with perampanel in sea water and compared to vehicle-treated control larvae for both negative phototaxis and response to light dimming. The negative phototaxis assay consisted of placing the larvae in a 10 cm petri dish of sea water with a 505 nm LED lamp placed to one side (described by us previously <a href="#bib46">Salas et al., 2018</a>). Images were collected at 1 min intervals over 5 hr to assess for taxis (<a href="#video1">Video 1</a>). <a href="#fig5">Figure 5b and c</a> show representative frames from the time-lapse capture at the start and at 60 min for control and perampanel-treated larvae, respectively. In the control sample the larvae at 60 min were observed to cluster at the side of the petri dish away from the light (distal side; red arrows in <a href="#fig5">Figure 5b</a>). By contrast no taxis was observed in the perampanel treated larvae (<a href="#fig5">Figure 5c</a>). Combined results from three independent assays (n = 129–365 larvae per group) are shown in <a href="#fig5">Figure 5d</a> and presented as the percent of larvae found on the distal third of the petri dish. For control larvae ~ 70% swam to the distal third within 1 hr, while the perampanel-treated larvae remained evenly distributed across the dish.</p>
+ <div
+ id="video1"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video1"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 1</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8xLm1wNA==/elife-44753-video1.mp4?_hash=QotDe4lMfotdXdc%2BUKEblblnp1b0B6bupqA3BcEJbnU%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video1.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video1.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Negative phototaxis of control and perampanel-treated <i>Ciona</i> larvae in 10 cm petri dishes.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Directional 505 nm illumination is from the left. Frames were taken at 1 per minute over five hours. In the video the 5 hr is compressed to 15 s (i.e., 1200X normal speed). Black and white tones were inverted to make the larvae more visible.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.014" class="doi__link">https://doi.org/10.7554/eLife.44753.014</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">The inability of the perampanel-treated larvae to undergo phototaxis was not the result of an inability to swim, as seen in <a href="#video2">Video 2</a> which was taken at 8.9 fps, with and without perampanel. Moreover, we observed that perampanel treatment had no effect on the light dimming response (<a href="#video3">Video 3</a>). <a href="#fig6">Figure 6a and b</a> show 5 s projection images from <a href="#video3">Video 3</a> immediately before and after dimming. In these images swims appear as lines, and the responses in control and perampanel-treated larvae appear qualitatively similar. To quantitatively compare dimming response, control and perampanel-treated larvae were exposed to a range of dimming intensities from 2 to 60-fold and the percentage of larvae responding was measured and presented as a percentage in <a href="#fig6">Figure 6c</a> (results are from three independent assays, with 46–139 larvae per group). The percentage responding at all intensities was very similar for both groups, and pair-wise comparisons at each fold change failed to show significance. In addition, no differences were measured in the velocity or duration of swims in pair-wise comparisons of control and perampanel-treated larvae at any fold-dimming (data not shown). We conclude that there is no change in sensitivity to dimming caused by perampanel treatment, while phototaxis was completely disrupted. Finally, we also observed that the touch response was not inhibited by perampanel (data not shown), despite the presence of VGLUT-positive epidermal sensory neurons (<a href="#bib20">Horie et al., 2008b</a>). This would appear to agree with the observation that primary RNs for the PNS, the eminens cells and the AMGs do not express the AMPAR (<a href="#bib18">Hirai et al., 2017</a>; and our observations). 
In addition to the AMPAR, the <i>Ciona</i> genome contains several other glutamate receptors including one kainate and one NMDA (<a href="#bib36">Okamura et al., 2005</a>), although their expression has not been characterized.</p>
+ <div
+ id="fig6"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig6"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg"
+ data-asset-viewer-width="1500"
+ data-asset-viewer-height="1124"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 6</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzYtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig6-v2.jpg?_hash=v4N145cqneQAaynzcjF%2FnAcen6AeM%2BkieeWEMRTMIFY%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Perampanel does not disrupt the light dimming response.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) Light dimming response in control larvae. Shown are 5 s (s) projections from time-lapse videos in which swims appear as lines. Left panel shows a projection 5 s before dimming, and right panel 5 s after dimming. (<b>b</b>) same as a, but larvae were perampanel-treated. (<b>c</b>) Quantification of light dimming response in control and perampanel treated larvae. Larvae were exposed to dimming of 505 nm light from 2- to 60-fold. Dimming response was scored as percent of larvae responding. Bars show averages of three independent assays ± standard deviation.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.015" class="doi__link">https://doi.org/10.7554/eLife.44753.015</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+ <div
+ id="video2"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video2"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 2</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8yLm1wNA==/elife-44753-video2.mp4?_hash=Td7A8NTjXLYjBj4RDkmj2zYo4EbpSS%2FCisGNZPPo1Ws%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video2.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video2.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Swimming of control and perampanel-treated <i>Ciona</i> larvae in a directional light field.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Larvae in 10 cm petri dishes were recorded at nine frames/second. Black and white tones were inverted to make the larvae more visible. The video plays at 5X normal speed.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.017" class="doi__link">https://doi.org/10.7554/eLife.44753.017</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+ <div
+ id="video3"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video3"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 3</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8zLm1wNA==/elife-44753-video3.mp4?_hash=GCjCBk4K%2BjkExSoFO0Wc8pv%2FhPFCkRZ84QpH%2Bavz994%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video3.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video3.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Dimming response of control and perampanel-treated <i>Ciona</i> larvae in 10 cm petri dishes.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Larvae were imaged for 70 s at five frames/second, with dimming of 505 nm ambient light at 10 s. Black and white tones were inverted, and thus the dimming appears as a brightening. The video plays at 5X normal speed.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.018" class="doi__link">https://doi.org/10.7554/eLife.44753.018</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">In summary, we are able to separate the phototaxis and dimming behaviors pharmacologically. Moreover, we can identify the VACHT/AMPAR-positive RNs as essential for an excitatory PR-I circuit that involves presynaptic glutamatergic PR-Is and postsynaptic cholinergic MGINs. The number and location of the VACHT/AMPAR-positive RNs, the circuit logic, and our behavioral observations are all consistent with these being prRNs.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-5"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">A disinhibitory circuit</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Of equal significance to our observation that navigation is inhibited by perampanel, is our observation that the dimming response, which is mediated by the PR-IIs (<a href="#bib46">Salas et al., 2018</a>), is not inhibited by perampanel (<a href="#fig6">Figure 6</a>). Our expression studies show that the PR-IIs are comprised of a mixture of VGAT- and VGAT/VGLUT-expressing photoreceptors. Although it is formally possible that PR-IIs signal exclusively via glutamate in an excitatory circuit via a non-AMPA glutamate receptor on their RNs, our observations that several of the PR-IIs are VGAT-only, as are the majority of the pr-AMG RNs, suggests an alternative disinhibitory circuitry logic. This circuit would consist of the inhibitory PR-IIs synapsing to the pr-AMG RNs to reduce their inhibition on the cholinergic MGINs.</p>
+<p class="paragraph">Implicit in the disinhibitory model is an autonomous level of motor activity in larvae that could be inhibited by the GABAergic pr-AMG RNs, and that this inhibition is released upon stimulation of the GABAergic PR-IIs. We investigated this possibility by two approaches. In the first approach, we inhibited GABAergic receptors with picrotoxin (<a href="#bib37">Olsen, 2014</a>), which should inhibit signals from the GABAergic photoreceptors and the pr-AMG RNs (and most likely the AntRNs), as well as PNS relay neurons, including the eminens cells and the AMGs. The ACINs, which are essential for the central pattern generator (<a href="#bib35">Nishino et al., 2010</a>), are glycinergic and should not be inhibited by picrotoxin. In the second approach, we took advantage of a previously described <i>Ciona</i> mutant, <i>frimousse (frm)</i> (<a href="#bib10">Deschet and Smith, 2004</a>; <a href="#bib16">Hackley et al., 2013</a>). In homozygous <i>frm</i> larvae the anterior BV is transfated to epidermis due to a null mutation in a neurula stage-specific connexin gene (<a href="#bib16">Hackley et al., 2013</a>). <i>Frm</i> larvae thus lack the ocellus pigment cell and photoreceptors, as well as the otolith, although the motor ganglion appears intact (<a href="#bib10">Deschet and Smith, 2004</a>; <a href="#bib16">Hackley et al., 2013</a>).</p>
+<section
+ class="article-section "
+ id="s2-5-1"
+>
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Hybridization chain reaction (HCR) in situ</h3>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-4" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph"><i>Ciona intestinalis</i>-type B were used for in situ studies and staged to match the animals used in the connectome study (<a href="#bib43">Ryan et al., 2016</a>). Optimized HCR in situ probes for each target transcript were obtained from Molecular Technologies. For detection of GABAergic/glycinergic cells, probes were made to the vesicular GABA transporter gene; for glutamatergic cells, probes were made to the vesicular glutamate transporter; and for cholinergic cells, probes were made to the vesicular acetylcholine transporter. The sequences from which the HCR probe sets were chosen were assembled from scaffold reads available through the Aniseed website (aniseed.cnrs.fr), and are shown in <a href="/articles/44753/figures#supp1">Supplementary file 1</a>. The in situ protocol followed the previously published <i>Ciona in situ</i> hybridization protocol (<a href="#bib8">Corbo et al., 1997</a>) until the prehybridization step. At this point, the protocol follows the published HCR protocol (<a href="#bib7">Choi et al., 2018</a>), with the following exception: during the amplification stage, incubation with hairpins is performed for 3 days instead of 12–16 hr.</p>
+<p class="paragraph">HCR in situ stained larvae were cleared with Slowfade Gold with DAPI (Invitrogen) and imaged on a Leica SP8 resonant scanning confocal microscope. Imaris v. 9.1 (Bitplane) was used to visualize embryos and assign centroids to nuclei using the ‘add new spots’ function, followed by manual correction when necessary. Nuclei were assigned using the maximum intensity projection, cropped to the area of interest. Volume rendering of in situ patterns was also done using Imaris v. 9.1.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-5"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Cell registration</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">A rotation matrix was calculated based on the 3-dimensional vectors between the anchor cells (ddN and/or antenna cells) and the center of the target cells (photoreceptors or relay neurons) using the HCR in situ (target set) and connectome cell centroids (source set). The source set was then rotated to an approximate orientation to the target set. Next, the Coherent Point Drift Algorithm was used to calculate an affine transformation matrix between the source set and the target set of cells (<a href="#bib32">Myronenko and Song, 2010</a>). This algorithm models the source set as a Gaussian Mixture Model (GMM), and the target set is treated as observations from the GMM. The transformation matrix is calculated to maximize the Maximum A Posteriori estimation that the observed point cloud is drawn from the GMM. A nearest neighbor mapping based on Euclidean distance is then used to find the closest corresponding point in the target cell set for each cell in the transformed source cell set. The implementation used was adapted from the pure Python implementation <a href="https://github.com/siavashk/pycpd">https://github.com/siavashk/pycpd</a>. The maximum number of iterations was set to 1000 and the maximum root mean squared error for convergence was set to 0.001. The code for the registration is available as supplementary material (<a href="/articles/44753/figures#scode1">Source codes 1</a>–<a href="/articles/44753/figures#scode3">3</a>).</p>
+<section
+ class="article-section "
+ id="s4-5-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Confusion matrix</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-5-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Each dataset containing NT information was registered to every other dataset of the same type using the algorithm detailed above. The EM-registration based cell assignments of each cell in both sets are then compared to each other to see if they agree (<a href="#bib49">Stehman, 1997</a>). The confusion matrix shows the number of times a cell assignment in one dataset corresponds with each other cell assignment in another dataset.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Behavioral assays</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">For time-lapse videos the inverted lid of a 60 mm petri dish was first coated with a thin layer of 1% agarose. Larvae were then added to the inverted lid with filtered sea water containing 0.1% BSA with streptomycin and kanamycin each at 20 μg/ml. Finally the dish was covered with a square of glass leaving no air at the top interface. Stock solutions of perampanel were dissolved in methanol and diluted to final concentrations of either 5 μM (Santa Cruz Biotech) or 15 µM (Adooq Bioscience) in filtered sea water/BSA/antibiotics. Picrotoxin (Tocris) was also diluted in methanol and used at a final concentration of 1 mM. Control samples received methanol alone.</p>
+<p class="paragraph">Time-lapse images were collected using a Hamamatsu Orca-ER camera fitted on a Navitar 7000 macro zoom lens. Programmable 700 nm and 505 nm LED lamps were used to illuminate the larvae (Mightex). All light intensity readings were taken with an Extech Instruments light meter.</p>
+<section
+ class="article-section "
+ id="s4-6-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Dimming-response</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">All larvae used were between 25 and 28 hpf (18°C). For image capture, the larvae were illuminated with the 700 nm LED lamp and the camera was fitted with a red filter to block the 505 nm light. The videos were recorded at five fps. In the assays, larvae were first recorded for 10 s with the 505 nm LED light mounted above the dish at 600 lux and then dimmed to specific values while image capture continued for another 3 min. Larvae were allowed to recover for 5 min before being assayed again.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Phototaxis</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-2" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">All larvae used were approximately 25 hpf (18°C). The 505 nm LED light was mounted to one side of the petri dish at approximately 3000 lux. Images were captured at one frame per minute for five hours, with the exception of a 30 s capture session at 8.9 fps to assay swimming behavior.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Spontaneous Swims</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-3" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">All larvae used were between 26 and 28 hpf. The plates were illuminated with only a 700 nm LED light in order to record dark conditions. The videos were recorded at about 8.9 fps for one minute.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Behavioral data analysis</h3>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+ id="s4-7-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Dim-response criteria</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Responses to light dimming were counted if: (1) the larva was stationary at the time of the light dimming, and (2) it swam for longer than 3 s. Three seconds was determined by measuring the duration of tail flicks as previously described (<a href="#bib46">Salas et al., 2018</a>). Larvae that bumped or brushed against other larvae or the dish edges were not counted.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Tracking and quantification</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-2" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Larval swims were tracked using a custom MATLAB script named Estimators of Locomotion Iterations for Animal Experiments (ELIANE). Before uploading to ELIANE, time-lapse images were first processed with Fiji (ImageJ) by subtracting a minimum Z-projection from all the frames and then inverting black and white. ELIANE takes the processed time-lapse images and first creates a background image by averaging the pixels from all the frames. Next, it goes to the initial frame, subtracts the background image, and stores all remaining objects found in the specified region of interest (ROI) as initial objects. Then, analyzing one-by-one the initial objects, it goes frame-by-frame subtracting the background image and analyzing all objects to determine the new position of the object by comparing the Euclidean distances of it to all other objects in that frame. If the object had moved unrealistically fast (&gt;6.5 mm/s), moved outside the ROI, or did not move after a set time (1 min), the object was not analyzed. This MATLAB script can be found in the Supplemental Materials (<a href="/articles/44753/figures#scode4">Source code 4</a>).</p>
+<p class="paragraph">The spontaneous swims in the <i>frimousse</i> experiment were quantified manually.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Sampling</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-3" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Assessment of larval swim parameters was performed using three independent assays. For the spontaneous swims, which were quantified manually, 25 larvae were selected randomly, starting from the center of the plate going outward, only using the ones that could be tracked for the entire minute recording session.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Tests of significance</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-4" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Dimming response significance and swim frequency were calculated using the Wilcoxon rank-sum test; spontaneous swim time significance was calculated using the Student’s <i>t</i>-test; and the variance of spontaneous swimming significance was calculated using the F-test.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <button class="speech-bubble speech-bubble--has-placeholder"
+ data-behaviour="SpeechBubble HypothesisOpener"
+
+aria-live="polite">
+ <span class="speech-bubble__inner"><span aria-hidden="true"><span data-visible-annotation-count>&#8220;</span></span><span class="visuallyhidden"> Open annotations. The current annotation count on this page is <span data-hypothesis-annotation-count>being calculated</span>.</span></span>
+</button>
+
+
+
+ <section
+ class="article-section "
+ id="references"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">References</h2>
+ </header>
+
+ <div class="article-section__body">
+
+<ol class="reference-list">
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">1</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib1" id="bib1">
+
+
+ <a href="https://doi.org/10.1016/j.coisb.2017.12.005" class="reference__title">Fold-change detection in biological systems</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Adler%22" class="reference__authors_link">M Adler</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Alon%22" class="reference__authors_link">U Alon</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2018)</span>
+
+ <div class="reference__origin"><i>Current Opinion in Systems Biology</i> <b>8</b>:81–89.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1016/j.coisb.2017.12.005" class="doi__link">https://doi.org/10.1016/j.coisb.2017.12.005</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Fold-change+detection+in+biological+systems&amp;author=M+Adler&amp;author=U+Alon&amp;publication_year=2018&amp;journal=Current+Opinion+in+Systems+Biology&amp;volume=8&amp;pages=pp.+81%E2%80%9389" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">2</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib2" id="bib2">
+
+
+ <a href="https://doi.org/10.1038/nrg2102" class="reference__title">Network motifs: <i>theory and experimental approaches</i></a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Alon%22" class="reference__authors_link">U Alon</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2007)</span>
+
+ <div class="reference__origin"><i>Nature Reviews Genetics</i> <b>8</b>:450–461.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1038/nrg2102" class="doi__link">https://doi.org/10.1038/nrg2102</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/17510665" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Network+motifs%3A+theory+and+experimental+approaches&amp;author=U+Alon&amp;publication_year=2007&amp;journal=Nature+Reviews+Genetics&amp;volume=8&amp;pages=pp.+450%E2%80%93461&amp;pmid=17510665" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">3</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib3" id="bib3">
+
+
+
+
+ <div class="reference__title">Evolution of eyes and photoreceptor cell types</div>
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Arendt%22" class="reference__authors_link">D Arendt</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2003)</span>
+
+ <div class="reference__origin"><i>The International Journal of Developmental Biology</i> <b>47</b>:563–571.</div>
+
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/14756332" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Evolution+of+eyes+and+photoreceptor+cell+types&amp;author=D+Arendt&amp;publication_year=2003&amp;journal=The+International+Journal+of+Developmental+Biology&amp;volume=47&amp;pages=pp.+563%E2%80%93571&amp;pmid=14756332" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">4</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib4" id="bib4">
+
+
+ <a href="https://doi.org/10.1038/nn1136" class="reference__title">EXP-1 is an excitatory GABA-gated cation channel</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:AA+Beg%22" class="reference__authors_link">AA Beg</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:EM+Jorgensen%22" class="reference__authors_link">EM Jorgensen</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2003)</span>
+
+ <div class="reference__origin"><i>Nature Neuroscience</i> <b>6</b>:1145–1152.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1038/nn1136" class="doi__link">https://doi.org/10.1038/nn1136</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/14555952" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=EXP-1+is+an+excitatory+GABA-gated+cation+channel&amp;author=AA+Beg&amp;author=EM+Jorgensen&amp;publication_year=2003&amp;journal=Nature+Neuroscience&amp;volume=6&amp;pages=pp.+1145%E2%80%931152&amp;pmid=14555952" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">5</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib5" id="bib5">
+
+
+ <a href="https://doi.org/10.1111/j.1460-9568.2005.04420.x" class="reference__title">GABAergic synaptic transmission modulates swimming in the ascidian larva</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:ER+Brown%22" class="reference__authors_link">ER Brown</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:A+Nishino%22" class="reference__authors_link">A Nishino</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Q+Bone%22" class="reference__authors_link">Q Bone</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:IA+Meinertzhagen%22" class="reference__authors_link">IA Meinertzhagen</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Y+Okamura%22" class="reference__authors_link">Y Okamura</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2005)</span>
+
+ <div class="reference__origin"><i>European Journal of Neuroscience</i> <b>22</b>:2541–2548.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1111/j.1460-9568.2005.04420.x" class="doi__link">https://doi.org/10.1111/j.1460-9568.2005.04420.x</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/16307596" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=GABAergic+synaptic+transmission+modulates+swimming+in+the+ascidian+larva&amp;author=ER+Brown&amp;author=A+Nishino&amp;author%5B2%5D=Q+Bone&amp;author%5B3%5D=IA+Meinertzhagen&amp;author%5B4%5D=Y+Okamura&amp;publication_year=2005&amp;journal=European+Journal+of+Neuroscience&amp;volume=22&amp;pages=pp.+2541%E2%80%932548&amp;pmid=16307596" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">49</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib49" id="bib49">
+
+
+ <a href="https://doi.org/10.1016/S0034-4257(97)00083-7" class="reference__title">Selecting and interpreting measures of thematic classification accuracy</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:SV+Stehman%22" class="reference__authors_link">SV Stehman</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(1997)</span>
+
+ <div class="reference__origin"><i>Remote Sensing of Environment</i> <b>62</b>:77–89.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1016/S0034-4257(97)00083-7" class="doi__link">https://doi.org/10.1016/S0034-4257(97)00083-7</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Selecting+and+interpreting+measures+of+thematic+classification+accuracy&amp;author=SV+Stehman&amp;publication_year=1997&amp;journal=Remote+Sensing+of+Environment&amp;volume=62&amp;pages=pp.+77%E2%80%9389" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">50</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib50" id="bib50">
+
+
+ <a href="https://doi.org/10.2307/1542300" class="reference__title">Ciliary Hovering in Larval Lancelets (=Amphioxus)</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:MD+Stokes%22" class="reference__authors_link">MD Stokes</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:ND+Holland%22" class="reference__authors_link">ND Holland</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(1995)</span>
+
+ <div class="reference__origin"><i>The Biological Bulletin</i> <b>188</b>:231–233.</div>
+
+ <span class="doi"><a href="https://doi.org/10.2307/1542300" class="doi__link">https://doi.org/10.2307/1542300</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/29281329" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Ciliary+Hovering+in+Larval+Lancelets+%28%3DAmphioxus%29&amp;author=MD+Stokes&amp;author=ND+Holland&amp;publication_year=1995&amp;journal=The+Biological+Bulletin&amp;volume=188&amp;pages=pp.+231%E2%80%93233&amp;pmid=29281329" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">51</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib51" id="bib51">
+
+
+ <a href="https://doi.org/10.1002/cne.23679" class="reference__title">A comparative examination of neural circuit and brain patterning between the lamprey and amphioxus reveals the evolutionary origin of the vertebrate visual center</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:DG+Suzuki%22" class="reference__authors_link">DG Suzuki</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Y+Murakami%22" class="reference__authors_link">Y Murakami</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:H+Escriva%22" class="reference__authors_link">H Escriva</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:H+Wada%22" class="reference__authors_link">H Wada</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2015)</span>
+
+ <div class="reference__origin"><i>Journal of Comparative Neurology</i> <b>523</b>:251–261.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/cne.23679" class="doi__link">https://doi.org/10.1002/cne.23679</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/25233869" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=A+comparative+examination+of+neural+circuit+and+brain+patterning+between+the+lamprey+and+amphioxus+reveals+the+evolutionary+origin+of+the+vertebrate+visual+center&amp;author=DG+Suzuki&amp;author=Y+Murakami&amp;author%5B2%5D=H+Escriva&amp;author%5B3%5D=H+Wada&amp;publication_year=2015&amp;journal=Journal+of+Comparative+Neurology&amp;volume=523&amp;pages=pp.+251%E2%80%93261&amp;pmid=25233869" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">52</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib52" id="bib52">
+
+
+ <a href="https://doi.org/10.1007/s00427-001-0205-0" class="reference__title">Developmental expression of ascidian neurotransmitter synthesis genes. I. choline acetyltransferase and acetylcholine transporter genes</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:K+Takamura%22" class="reference__authors_link">K Takamura</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Egawa%22" class="reference__authors_link">T Egawa</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Ohnishi%22" class="reference__authors_link">S Ohnishi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Okada%22" class="reference__authors_link">T Okada</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Fukuoka%22" class="reference__authors_link">T Fukuoka</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2002)</span>
+
+ <div class="reference__origin"><i>Development Genes and Evolution</i> <b>212</b>:50–53.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/s00427-001-0205-0" class="doi__link">https://doi.org/10.1007/s00427-001-0205-0</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/11875658" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Developmental+expression+of+ascidian+neurotransmitter+synthesis+genes.+I.+choline+acetyltransferase+and+acetylcholine+transporter+genes&amp;author=K+Takamura&amp;author=T+Egawa&amp;author%5B2%5D=S+Ohnishi&amp;author%5B3%5D=T+Okada&amp;author%5B4%5D=T+Fukuoka&amp;publication_year=2002&amp;journal=Development+Genes+and+Evolution&amp;volume=212&amp;pages=pp.+50%E2%80%9353&amp;pmid=11875658" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">53</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib53" id="bib53">
+
+
+ <a href="https://doi.org/10.2108/zsj.27.191" class="reference__title">Neural map of the larval central nervous system in the ascidian Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:K+Takamura%22" class="reference__authors_link">K Takamura</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:N+Minamida%22" class="reference__authors_link">N Minamida</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Okabe%22" class="reference__authors_link">S Okabe</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2010)</span>
+
+ <div class="reference__origin"><i>Zoological Science</i> <b>27</b>:191–203.</div>
+
+ <span class="doi"><a href="https://doi.org/10.2108/zsj.27.191" class="doi__link">https://doi.org/10.2108/zsj.27.191</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/20141424" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Neural+map+of+the+larval+central+nervous+system+in+the+ascidian+Ciona+intestinalis&amp;author=K+Takamura&amp;author=N+Minamida&amp;author%5B2%5D=S+Okabe&amp;publication_year=2010&amp;journal=Zoological+Science&amp;volume=27&amp;pages=pp.+191%E2%80%93203&amp;pmid=20141424" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">54</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib54" id="bib54">
+
+
+ <a href="https://doi.org/10.1007/978-1-61779-210-6_15" class="reference__title">Ciona genetics</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:MT+Veeman%22" class="reference__authors_link">MT Veeman</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Chiba%22" class="reference__authors_link">S Chiba</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:WC+Smith%22" class="reference__authors_link">WC Smith</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2011)</span>
+
+ <div class="reference__origin"><i>Methods in Molecular Biology</i> <b>770</b>:401–422.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/978-1-61779-210-6_15" class="doi__link">https://doi.org/10.1007/978-1-61779-210-6_15</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/21805273" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Ciona+genetics&amp;author=MT+Veeman&amp;author=S+Chiba&amp;author%5B2%5D=WC+Smith&amp;publication_year=2011&amp;journal=Methods+in+Molecular+Biology&amp;volume=770&amp;pages=pp.+401%E2%80%93422&amp;pmid=21805273" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">55</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib55" id="bib55">
+
+
+ <a href="https://doi.org/10.1073/pnas.1207580109" class="reference__title">Molecular analysis of the amphioxus frontal eye unravels the evolutionary origin of the retina and pigment cells of the vertebrate eye</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:P+Vopalensky%22" class="reference__authors_link">P Vopalensky</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:J+Pergner%22" class="reference__authors_link">J Pergner</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Liegertova%22" class="reference__authors_link">M Liegertova</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:E+Benito-Gutierrez%22" class="reference__authors_link">E Benito-Gutierrez</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Arendt%22" class="reference__authors_link">D Arendt</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Z+Kozmik%22" class="reference__authors_link">Z Kozmik</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2012)</span>
+
+ <div class="reference__origin"><i>PNAS</i> <b>109</b>:15383–15388.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1073/pnas.1207580109" class="doi__link">https://doi.org/10.1073/pnas.1207580109</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/22949670" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Molecular+analysis+of+the+amphioxus+frontal+eye+unravels+the+evolutionary+origin+of+the+retina+and+pigment+cells+of+the+vertebrate+eye&amp;author=P+Vopalensky&amp;author=J+Pergner&amp;author%5B2%5D=M+Liegertova&amp;author%5B3%5D=E+Benito-Gutierrez&amp;author%5B4%5D=D+Arendt&amp;author%5B5%5D=Z+Kozmik&amp;publication_year=2012&amp;journal=PNAS&amp;volume=109&amp;pages=pp.+15383%E2%80%9315388&amp;pmid=22949670" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">56</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib56" id="bib56">
+
+
+ <a href="https://doi.org/10.1002/gene.20032" class="reference__title">Identification of neuron-specific promoters in Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Yoshida%22" class="reference__authors_link">R Yoshida</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Sakurai%22" class="reference__authors_link">D Sakurai</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Horie%22" class="reference__authors_link">T Horie</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Kawakami%22" class="reference__authors_link">I Kawakami</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Tsuda%22" class="reference__authors_link">M Tsuda</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Kusakabe%22" class="reference__authors_link">T Kusakabe</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2004)</span>
+
+ <div class="reference__origin"><i>Genesis</i> <b>39</b>:130–140.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/gene.20032" class="doi__link">https://doi.org/10.1002/gene.20032</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/15170699" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Identification+of+neuron-specific+promoters+in+Ciona+intestinalis&amp;author=R+Yoshida&amp;author=D+Sakurai&amp;author%5B2%5D=T+Horie&amp;author%5B3%5D=I+Kawakami&amp;author%5B4%5D=M+Tsuda&amp;author%5B5%5D=T+Kusakabe&amp;publication_year=2004&amp;journal=Genesis&amp;volume=39&amp;pages=pp.+130%E2%80%93140&amp;pmid=15170699" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">57</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib57" id="bib57">
+
+
+ <a href="https://doi.org/10.1242/jeb.012864" class="reference__title">Shadow response in the blind cavefish Astyanax reveals conservation of a functional pineal eye</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Yoshizawa%22" class="reference__authors_link">M Yoshizawa</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:WR+Jeffery%22" class="reference__authors_link">WR Jeffery</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2008)</span>
+
+ <div class="reference__origin"><i>Journal of Experimental Biology</i> <b>211</b>:292–299.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1242/jeb.012864" class="doi__link">https://doi.org/10.1242/jeb.012864</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/18203983" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Shadow+response+in+the+blind+cavefish+Astyanax+reveals+conservation+of+a+functional+pineal+eye&amp;author=M+Yoshizawa&amp;author=WR+Jeffery&amp;publication_year=2008&amp;journal=Journal+of+Experimental+Biology&amp;volume=211&amp;pages=pp.+292%E2%80%93299&amp;pmid=18203983" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">58</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib58" id="bib58">
+
+
+ <a href="https://doi.org/10.1523/JNEUROSCI.0141-10.2010" class="reference__title">Synaptic and vesicular coexistence of VGLUT and VGAT in selected excitatory and inhibitory synapses</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:JF+Zander%22" class="reference__authors_link">JF Zander</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:A+M%C3%BCnster-Wandowski%22" class="reference__authors_link">A Münster-Wandowski</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Brunk%22" class="reference__authors_link">I Brunk</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Pahner%22" class="reference__authors_link">I Pahner</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+G%C3%B3mez-Lira%22" class="reference__authors_link">G Gómez-Lira</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Heinemann%22" class="reference__authors_link">U Heinemann</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Guti%C3%A9rrez%22" class="reference__authors_link">R Gutiérrez</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Laube%22" class="reference__authors_link">G Laube</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Ahnert-Hilger%22" class="reference__authors_link">G Ahnert-Hilger</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2010)</span>
+
+ <div class="reference__origin"><i>Journal of Neuroscience</i> <b>30</b>:7634–7645.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1523/JNEUROSCI.0141-10.2010" class="doi__link">https://doi.org/10.1523/JNEUROSCI.0141-10.2010</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/20519538" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Synaptic+and+vesicular+coexistence+of+VGLUT+and+VGAT+in+selected+excitatory+and+inhibitory+synapses&amp;author=JF+Zander&amp;author=A+M%C3%BCnster-Wandowski&amp;author%5B2%5D=I+Brunk&amp;author%5B3%5D=I+Pahner&amp;author%5B4%5D=G+G%C3%B3mez-Lira&amp;author%5B5%5D=U+Heinemann&amp;author%5B6%5D=R+Guti%C3%A9rrez&amp;author%5B7%5D=G+Laube&amp;author%5B8%5D=G+Ahnert-Hilger&amp;publication_year=2010&amp;journal=Journal+of+Neuroscience&amp;volume=30&amp;pages=pp.+7634%E2%80%937645&amp;pmid=20519538" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">59</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib59" id="bib59">
+
+
+ <a href="https://doi.org/10.1002/cne.21565" class="reference__title">Developmental expression of glutamic acid decarboxylase and of gamma-aminobutyric acid type B receptors in the ascidian Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Zega%22" class="reference__authors_link">G Zega</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Biggiogero%22" class="reference__authors_link">M Biggiogero</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Groppelli%22" class="reference__authors_link">S Groppelli</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Candiani%22" class="reference__authors_link">S Candiani</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Oliveri%22" class="reference__authors_link">D Oliveri</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Parodi%22" class="reference__authors_link">M Parodi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Pestarino%22" class="reference__authors_link">M Pestarino</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:F+De+Bernardi%22" class="reference__authors_link">F De Bernardi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Pennati%22" class="reference__authors_link">R Pennati</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2008)</span>
+
+ <div class="reference__origin"><i>The Journal of Comparative Neurology</i> <b>506</b>:489–505.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/cne.21565" class="doi__link">https://doi.org/10.1002/cne.21565</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/18041772" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Developmental+expression+of+glutamic+acid+decarboxylase+and+of+gamma-aminobutyric+acid+type+B+receptors+in+the+ascidian+Ciona+intestinalis&amp;author=G+Zega&amp;author=M+Biggiogero&amp;author%5B2%5D=S+Groppelli&amp;author%5B3%5D=S+Candiani&amp;author%5B4%5D=D+Oliveri&amp;author%5B5%5D=M+Parodi&amp;author%5B6%5D=M+Pestarino&amp;author%5B7%5D=F+De+Bernardi&amp;author%5B8%5D=R+Pennati&amp;publication_year=2008&amp;journal=The+Journal+of+Comparative+Neurology&amp;volume=506&amp;pages=pp.+489%E2%80%93505&amp;pmid=18041772" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">60</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib60" id="bib60">
+
+
+ <a href="https://doi.org/10.1007/978-981-10-7545-2_5" class="reference__title">Electroporation in ascidians: history, theory and protocols</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:RW+Zeller%22" class="reference__authors_link">RW Zeller</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2018)</span>
+
+ <div class="reference__origin"><i>Advances in Experimental Medicine and Biology</i> <b>1029</b>:37–48.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/978-981-10-7545-2_5" class="doi__link">https://doi.org/10.1007/978-981-10-7545-2_5</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/29542079" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Electroporation+in+ascidians%3A+history%2C+theory+and+protocols&amp;author=RW+Zeller&amp;publication_year=2018&amp;journal=Advances+in+Experimental+Medicine+and+Biology&amp;volume=1029&amp;pages=pp.+37%E2%80%9348&amp;pmid=29542079" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+</ol>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="SA1"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Decision letter</h2>
+ </header>
+
+ <div class="article-section__body">
+ <div class="decision-letter-header">
+ <ol class="listing-list">
+ <li class="listing-list__item">
+ <div class="profile-snippet">
+ <div class="profile-snippet__container clearfix">
+
+ <div class="profile-snippet__name">Oliver Hobert</div>
+ <div class="profile-snippet__title">Reviewing Editor; Howard Hughes Medical Institute, Columbia University, United States</div>
+ </div>
+ </div>
+ </li>
+ <li class="listing-list__item">
+ <div class="profile-snippet">
+ <div class="profile-snippet__container clearfix">
+
+ <div class="profile-snippet__name">Ronald L Calabrese</div>
+ <div class="profile-snippet__title">Senior Editor; Emory University, United States</div>
+ </div>
+ </div>
+ </li>
+ </ol>
+ <div class="decision-letter-header__main_text"><p class="paragraph">In the interests of transparency, eLife includes the editorial decision letter and accompanying author responses. A lightly edited version of the letter sent to the authors after peer review is shown, indicating the most substantive concerns; minor comments are not usually included.</p>
+</div>
+</div>
+<p class="paragraph">Thank you for submitting your article "Parallel Visual Circuitry in a Basal Chordate" for consideration by <i>eLife</i>. Your article has been reviewed by Ronald Calabrese as the Senior Editor, a Reviewing Editor, Oliver Hobert, and two reviewers. The reviewers have opted to remain anonymous.</p>
+<p class="paragraph">The reviewers have discussed the reviews with one another and the Reviewing Editor has drafted this decision to help you prepare a revised submission.</p>
+<p class="paragraph">The reviewers – and the Reviewing Editor – agree that the manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. However, there is also agreement that the evidence behind the GABA receptors being involved in the behavioral response to dimming is entirely indirect, and would be substantially strengthened by a pharmacological parallel to the Glutamate receptor antagonist data. That is, according to the disinhibition model, acute blockade of GABA(A) receptors with a pharmacological antagonist should produce a "hyperactive" movement phenotype akin to the <i>frm</i> mutant animal, but one that should still be capable of phototaxis (but not a dimming response). Such a result would provide an elegant "double dissociation" that would parallel the findings with the AMPA receptor antagonist.</p>
+<p class="paragraph">There is also agreement that the manuscript requires an extensive revision to the Introduction that puts the work in a broader context. At present, the manuscript begins largely with a description of the <i>Ciona</i> connectome, in relation to other complete connectomes, and then plunges directly into a more detailed description of ganglia, cells and synapses. A broader audience could be engaged by the work if the authors identified the key question of interest, and provides some of the background material currently found in the Discussion section, before diving into the pertinent details.</p>
+<p class="paragraph"><i>Reviewer #1:</i></p>
+<p class="paragraph">How the functional architecture of visual systems has evolved to subserve different behavioral goals is a fundamental question of broad interest. At present, while we have a deep understanding of visual system organization in a few experimental models, such a fundamental question can be enriched through the exploration of evolutionarily divergent organisms. In this context, Smith and colleagues integrate a new description of neurotransmitter expression patterns, ultrastructural connectivity, pharmacology and behavior to derive new insights into the architecture of the Ascidian <i>Ciona</i> visual system.</p>
+<p class="paragraph">First, by mapping RNA expression patterns onto neurons spanning the <i>Ciona</i> nervous system using a combination of HCR in situs and image registration, they assign neurotransmitter types to many neurons. Importantly, these studies reveal three classes of ocellus photoreceptors – one that uses glutamate as a transmitter, one that uses GABA, and one that appears to release both. Next, using a glutamate receptor antagonist, they demonstrate that blockade of signaling from glutamatergic photoreceptors blocks phototaxis, but does not affect a second behavior evoked by transient dimming. Finally, consistent with the idea that a subset of photoreceptors could control the dimming response by depolarizing to darkness, and releasing GABA, the authors describe a mutant in which visual input to motor pathways is disrupted, leading to an animal that swims constitutively.</p>
+<p class="paragraph">Overall, this manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. I find the idea that there might be photoreceptors that appear to hyperpolarize to light and release GABA particularly exciting, and it will be fascinating to learn more about how these photoreceptors are related to retinal and pineal photoreceptors in vertebrates. However, I do feel that the evidence behind these receptors being involved in the behavioral response to dimming is entirely indirect, and would be substantially strengthened by a pharmacological parallel to the Glutamate receptor antagonist data. That is, according to the disinhibition model, acute blockade of GABA(A) receptors with a pharmacological antagonist should produce a "hyperactive" movement phenotype akin to the <i>frm</i> mutant animal, but one that should still be capable of phototaxis (but not a dimming response). Such a result would provide an elegant "double dissociation" that would parallel the findings with the AMPA receptor antagonist.</p>
+<p class="paragraph"><i>Reviewer #2:</i></p>
+<p class="paragraph">The fact that there is a full map of connections in <i>Ciona</i> provides a great opportunity to dissect circuits. Even better, the tools are there to perform some genetic and pharmacological perturbations, and evaluate effects on behavior. This study begins to exploit these features in a study of the <i>Ciona</i> visual system. The authors dug deeper into two circuits that begin with photoreception. They used transgenic reporter animals and in situ hybridization to define the use of two classical neurotransmitters, glutamate and GABA. Surprisingly, one type of photoreceptor uses GABA, an inhibitory neurotransmitter not previously described as used by photoreceptors in any species. From the known connections, they also make a case for how the two circuits are connected, and further suggest that one of the circuits is disinhibitory, perhaps along with other sensory inputs, for oscillatory swimming behavior. Through the use of a specific antagonist for a glutamate receptor they are able to show that one of the photoreceptor circuits is involved with detection of the direction of light (phototaxis), using a behavioral assay. Interestingly, inhibition of phototaxis has no effect on the other circuit, which detects dimming. However, it is likely that there is cross talk between the two photoreceptor circuits, as suggested by the known anatomy.</p>
+<p class="paragraph">Overall this study provides a very nice example of photoreceptor directed behavior as controlled by two different circuits. It provides food for thought regarding the evolution of different types of visually guided behaviors and the use of different types of photoreceptors. Optogenetic manipulations and calcium imaging (tried by the authors but did not work due to technical limitations) would greatly add to this story, but as it stands it constitutes a very nice addition to our understanding of a sensory circuit and behavior.</p>
+
+
+
+
+ <span class="doi doi--article-section"><a href="https://doi.org/10.7554/eLife.44753.036" class="doi__link">https://doi.org/10.7554/eLife.44753.036</a></span>
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="SA2"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Author response</h2>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph"><i>The reviewers – and the reviewing editor – agree that the manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. […] A broader audience could be engaged by the work if the authors identified the key question of interest, and provides some of the background material currently found in the Discussion section, before diving into the pertinent details.</i></p>
+<p class="paragraph">In our revised manuscript we have thoroughly addressed the reviewers' concerns and have included extensive new data from behavioral studies using a GABA receptor antagonist (Figure 7 in the revised manuscript, and related text). As you will read in the text, our results with the GABA receptor antagonist (picrotoxin) agree thoroughly with our disinhibition model (and with our observations of the <i>frm</i> mutant). The use of the GABA receptor antagonist was an excellent suggestion, and we feel that the results presented here greatly strengthen our model. As you will see in Figure 7, picrotoxin (like the <i>frm</i> mutant) leads to increased spontaneous swimming. Moreover, picrotoxin also leads to a dramatic reduction in the dimming response. We then show with use of picrotoxin combined with the AMPAR antagonist perampanel that the residual dimming response is due to parallel activation of the excitatory circuit. Finally, we show that picrotoxin-treated larvae are still capable of phototaxis. However, we observed that the phototaxis ability of the picrotoxin-treated larvae was somewhat dampened in comparison to controls, which we attribute to excitotoxicity of prolonged picrotoxin exposure (Movie5 documents the toxicity of prolonged picrotoxin exposure).</p>
+<p class="paragraph">We have also extensively rewritten the Introduction along the lines suggested by the reviewer. Additionally, as requested, we have included in the text the number of animals tested using the pOpsin1/VGAT Kaede combination (n=5). Finally, we have collected additional data on neurotransmitter use by cells of the motor ganglion. These additional data are presented in revised versions of Figure 4 and Figure 4-figure supplement 1. Our conclusions regarding the minimal circuit are unchanged by this additional data; however, we are revising our neurotransmitter assignment to the anterior pair of ACINs. This reassignment was undertaken after consultation with Kerrianne Ryan (author of the <i>Ciona</i> connectome manuscript). We also include an approved personal communication from Dr. Ryan in this section.</p>
+
+
+
+
+ <span class="doi doi--article-section"><a href="https://doi.org/10.7554/eLife.44753.037" class="doi__link">https://doi.org/10.7554/eLife.44753.037</a></span>
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="info"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Article and author information</h2>
+ </header>
+
+ <div class="article-section__body">
+ <h3 class="authors-details__heading">Author details</h3>
+<ol class="authors-details__authors">
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x8d8d9914" id="x8d8d9914">
+
+ <h4 class="author-details__name">Matthew J Kourakis</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Data curation, Formal analysis, Supervision, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contributed equally with</h5>
+ <span class="author-details__text">Cezar Borba</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+ <section class="author-details__section">
+ <span class="orcid">
+ <a href="https://orcid.org/0000-0002-1261-3811">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/orcid.b96370b9.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/orcid.10f6112b.png" class="orcid__icon"
+ alt="ORCID icon">
+ </picture> <span class="visuallyhidden">"This ORCID iD identifies the author of this article:"</span>
+ 0000-0002-1261-3811</a>
+ </span>
+ </section>
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xf3e51472" id="xf3e51472">
+
+ <h4 class="author-details__name">Cezar Borba</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Software, Formal analysis, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contributed equally with</h5>
+ <span class="author-details__text">Matthew J Kourakis</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xb536f34f" id="xb536f34f">
+
+ <h4 class="author-details__name">Angela Zhang</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Electrical and Computer Engineering, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Software, Formal analysis, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x1d85dfc3" id="x1d85dfc3">
+
+ <h4 class="author-details__name">Erin Newman-Smith</h4>
+
+ <section class="author-details__section">
+ <ol class="author-details__list list list--bullet">
+ <li class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</li>
+ <li class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</li>
+ </ol>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Formal analysis, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x6107dd5d" id="x6107dd5d">
+
+ <h4 class="author-details__name">Priscilla Salas</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Investigation, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x8b937bbf" id="x8b937bbf">
+
+ <h4 class="author-details__name">B Manjunath</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Supervision, Funding acquisition, Project administration, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xa3814a31" id="xa3814a31">
+
+ <h4 class="author-details__name">William C Smith</h4>
+
+ <section class="author-details__section">
+ <ol class="author-details__list list list--bullet">
+ <li class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</li>
+ <li class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</li>
+ </ol>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Funding acquisition, Writing—original draft, Project administration</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">For correspondence</h5>
+ <span class="author-details__text"><a href="mailto:w_smith@ucsb.edu">w_smith@ucsb.edu</a></span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+ <section class="author-details__section">
+ <span class="orcid">
+ <a href="https://orcid.org/0000-0002-6257-7695">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/orcid.b96370b9.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/orcid.10f6112b.png" class="orcid__icon"
+ alt="ORCID icon">
+ </picture> <span class="visuallyhidden">"This ORCID iD identifies the author of this article:"</span>
+ 0000-0002-6257-7695</a>
+ </span>
+ </section>
+
+
+</div>
+
+</li>
+</ol>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Funding</h3>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">National Institute of Neurological Disorders and Stroke (R01NS103774)</h4>
+ </header>
+
+ <div class="article-section__body">
+
+
+ <ul class="list list--bullet">
+ <li>William C Smith</li>
+ </ul>
+
+
+
+
+
+ </div>
+
+</section>
+<p class="paragraph">The funders had no role in study design, data collection and interpretation, or the decision to submit the work for publication.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Acknowledgements</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">We thank Takeo Horie and Takahiro Kusakabe for the opsin1 promoter construct; Yasunori Sasakura for the stable pVGAT &gt;kaede line and pVACHT &gt;CFP plasmid; Haruo Okado for the pAMPAR &gt;GFP construct; Kerrianne Ryan for her helpful discussion and sharing unpublished data; and Chelsea Parlett-Pelleriti for her advice on statistical analysis. We acknowledge the use of the NRI-MCDB Microscopy Facility and the Resonant Scanning Confocal supported by NSF MRI grant 1625770. This work was supported by an award from NIH (NS103774) to WCS and BM.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Senior Editor</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list">
+ <li>Ronald L Calabrese, Emory University, United States</li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Reviewing Editor</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list">
+ <li>Oliver Hobert, Howard Hughes Medical Institute, Columbia University, United States</li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Publication history</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list list--bullet">
+ <li>Received: December 28, 2018</li>
+ <li>Accepted: April 11, 2019</li>
+ <li>Accepted Manuscript published: <a href="/articles/44753v1">April 18, 2019 (version 1)</a></li>
+ <li>Version of Record published: <a href="/articles/44753">May 3, 2019 (version 2)</a></li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Copyright</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p>© 2019, Kourakis et al.</p><p>This article is distributed under the terms of the <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>, which permits unrestricted use and redistribution provided that the original author and source are credited.</p>
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="metrics"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Metrics</h2>
+ </header>
+
+ <div class="article-section__body">
+ <ul class="statistic-collection clearfix">
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 807
+ </dd>
+ <dt class="statistic__label">
+ Page views
+ </dt>
+ </dl>
+ </li>
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 173
+ </dd>
+ <dt class="statistic__label">
+ Downloads
+ </dt>
+ </dl>
+ </li>
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 0
+ </dd>
+ <dt class="statistic__label">
+ Citations
+ </dt>
+ </dl>
+ </li>
+</ul>
+<p class="paragraph">Article citation count generated by polling the highest count across the following sources: <a href="">Crossref</a>, <a href="">PubMed Central</a>, <a href="">Scopus</a>.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Download links</h2>
+ </header>
+
+ <div class="article-section__body">
+ <div data-behaviour="ArticleDownloadLinksList" id="downloads" aria-labelledby="downloads-label">
+ <div class="visuallyhidden"><span id="downloads-label">A two-part list of links to download the article, or parts of the article, in various formats.</span></div>
+
+ <h3 class="article-download-links-list__heading">Downloads<span class="visuallyhidden"> (link to download the article as PDF)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D" class="article-download-links-list__link"
+
+ data-article-identifier="10.7554/eLife.44753"
+ data-download-type="pdf-article"
+
+ >Article PDF</a></li>
+ <li><a href="https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtZmlndXJlcy12Mi5wZGY=/elife-44753-figures-v2.pdf?_hash=x4qA8GP%2BKBA2SOVJsL3falcqZCUNChW4fqaIfFIjgHk%3D" class="article-download-links-list__link"
+
+ data-article-identifier="10.7554/eLife.44753"
+ data-download-type="pdf-figures"
+
+ >Figures PDF</a></li>
+ </ul>
+ <h3 class="article-download-links-list__heading">Download citations<span class="visuallyhidden"> (links to download the citations from this article in formats compatible with various reference manager tools)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="/articles/44753.bib" class="article-download-links-list__link"
+
+
+ >BibTeX</a></li>
+ <li><a href="/articles/44753.ris" class="article-download-links-list__link"
+
+
+ >RIS</a></li>
+ </ul>
+ <h3 class="article-download-links-list__heading">Open citations<span class="visuallyhidden"> (links to open the citations from this article in various online reference manager services)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="https://www.mendeley.com/import?doi=10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >Mendeley</a></li>
+ <li><a href="https://www.readcube.com/articles/10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >ReadCube</a></li>
+ <li><a href="papers2://url/https%3A%2F%2Felifesciences.org%2Farticles%2F44753?title=Parallel+visual+circuitry+in+a+basal+chordate" class="article-download-links-list__link"
+
+
+ >Papers</a></li>
+ <li><a href="http://www.citeulike.org/posturl?url=https%3A%2F%2Felifesciences.org%2Farticles%2F44753&amp;title=Parallel+visual+circuitry+in+a+basal+chordate&amp;doi=10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >CiteULike</a></li>
+ </ul>
+
+</div>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+<section class="article-meta">
+
+ <div class="article-meta__container">
+
+
+ <section class="article-meta__group">
+ <h4 class="article-meta__group_title">Categories and tags</h4>
+ <ul class="article-meta__link_list">
+ <li class="article-meta__link_list_item">
+ <a href="/articles/research-article" class="article-meta__link">Research Article</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/subjects/neuroscience" class="article-meta__link">Neuroscience</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=connectome" class="article-meta__link">connectome</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=visuomotor" class="article-meta__link">visuomotor</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=behavior" class="article-meta__link">behavior</a></li>
+ </ul>
+ </section>
+
+
+ <section class="article-meta__group">
+ <h4 class="article-meta__group_title">Research organism</h4>
+ <ul class="article-meta__link_list">
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=C.%20intestinalis" class="article-meta__link"><i>C. intestinalis</i></a></li>
+ </ul>
+ </section>
+
+
+ </div>
+
+</section>
+
+
+
+
+
+
+
+ </div>
+
+
+ <div class="grid__item one-whole
+
+ large--four-twelfths x-large--three-twelfths
+ grid-secondary-column">
+
+ <div class="grid-secondary-column__item grid-secondary-column__item--wide-only">
+
+ <div>
+
+
+ <ol class="listing-list ">
+ <li class="listing-list__item"><div class="teaser teaser--secondary teaser--related ">
+
+ <ol class="teaser__context_label_list" aria-label="These research categories are for the following article">
+ <li class="teaser__context_label_item">
+
+ <span class="teaser__context_label">Of interest</span>
+ </li>
+ </ol>
+
+ <header class="teaser__header">
+
+
+ <h4 class="teaser__header_text">
+ <a href="/articles/48779" class="teaser__header_text_link">An arbitrary-spectrum spatial visual stimulator for vision research</a>
+ </h4>
+
+ <div class="teaser__secondary_info">
+ Katrin Franke et al.
+ </div>
+
+ </header>
+
+
+ <footer class="teaser__footer">
+
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/tools-resources" >Tools and Resources</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+
+
+ </footer>
+</div>
+</li><li class="listing-list__item"><a href="#listing" class="see-more-link">Further reading</a>
+</li></ol>
+
+
+</div>
+
+
+ </div>
+
+ </div>
+
+
+ </div>
+
+</div>
+
+
+ <div class="wrapper listing-read-more">
+
+ <div class="grid">
+
+ <div class="content-container grid__item
+ one-whole
+ large--ten-twelfths
+ push--large--one-twelfth
+ x-large--eight-twelfths
+ push--x-large--two-twelfths
+ grid-column">
+
+ <div class="listing-list-heading">
+ <h3 class="list-heading">Further reading</h3>
+ </div>
+
+ <ol class="listing-list listing-list--read-more" id="listing">
+ <li class="listing-list__item listing-list__item--related">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/48779" class="content-header__title_link">An arbitrary-spectrum spatial visual stimulator for vision research</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Katrin Franke et al.</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/tools-resources" >Tools and Resources</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+ <li class="listing-list__item ">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/47996" class="content-header__title_link">Self-organization of modular network architecture by activity-dependent neuronal migration and outgrowth</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Samora Okujeni, Ulrich Egert</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+ <li class="listing-list__item ">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/48114" class="content-header__title_link">Pretectal neurons control hunting behaviour</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Paride Antinucci et al.</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+
+ </ol>
+
+
+ </div>
+
+ </div>
+
+</div>
+
+
+
+
+ </div>
+
+
+ </main>
+
+ <section class="email-cta">
+
+</section>
+
+
+ <div class="main-menu" id="mainMenu" data-behaviour="MainMenu" tabindex="0">
+ <nav class="main-menu__container" role="navigation">
+ <h3 class="list-heading">Menu</h3>
+ <ul class="main-menu__list">
+ <li class="main-menu__list_item">
+ <a href="/subjects" class="main-menu__list_link">Research categories</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="https://submit.elifesciences.org/html/elife_author_instructions.html" class="main-menu__list_link">Author guide</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="https://submit.elifesciences.org/html/elife_reviewer_instructions.html" class="main-menu__list_link">Reviewer guide</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/about" class="main-menu__list_link">About</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/inside-elife" class="main-menu__list_link">Inside eLife</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/community" class="main-menu__list_link">Community</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/labs" class="main-menu__list_link">Innovation</a>
+ </li>
+ </ul>
+ <a href="#siteHeader" class="to-top-link">Back to top</a>
+ </nav>
+ </div>
+
+<ol class="investor-logos" role="contentinfo" aria-label="eLife is funded by these organisations">
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/hhmi.9d0951a2.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/hhmi@2x.e63a8d68.webp 2x, /assets/images/investors/hhmi@1x.c1e8d1b9.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/hhmi@2x.58718155.png 2x, /assets/images/investors/hhmi@1x.ad4627a8.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/hhmi@1x.ad4627a8.png"
+
+ alt="Howard Hughes Medical Institute"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/wellcome.813f8634.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/wellcome@2x.993dd002.webp 2x, /assets/images/investors/wellcome@1x.1fd7fa84.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/wellcome@2x.75f8d6f9.png 2x, /assets/images/investors/wellcome@1x.ff6d9292.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/wellcome@1x.ff6d9292.png"
+
+ alt="Wellcome Trust"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/max.090f7458.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/max@2x.3215c512.webp 2x, /assets/images/investors/max@1x.8fabbf5a.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/max@2x.d233b5b1.png 2x, /assets/images/investors/max@1x.5daaf9a0.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/max@1x.5daaf9a0.png"
+
+ alt="Max-Planck-Gesellschaft"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/kaw.c1bb2e4b.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/kaw@2x.0afbcf57.webp 2x, /assets/images/investors/kaw@1x.04f3c517.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/kaw@2x.cc1a5adc.png 2x, /assets/images/investors/kaw@1x.318b49a9.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/kaw@1x.318b49a9.png"
+
+ alt="Knut and Alice Wallenberg Foundation"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+</ol>
+
+<footer class="site-footer">
+
+ <div class="site-footer__container">
+
+ <div class="grid-cell">
+
+ <nav class="footer-navigation">
+ <ul class="footer-navigation__list">
+ <li class="footer-navigation__list_item">
+ <a href="/about" class="footer-navigation__list_link">About</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/jobs" class="footer-navigation__list_link">Jobs</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/who-we-work-with" class="footer-navigation__list_link">Who we work with</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/alerts" class="footer-navigation__list_link">Alerts</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/contact" class="footer-navigation__list_link">Contact</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/terms" class="footer-navigation__list_link">Terms and conditions</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/privacy" class="footer-navigation__list_link">Privacy notice</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/inside-elife" class="footer-navigation__list_link">Inside eLife</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/archive/2019" class="footer-navigation__list_link">Monthly archive</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/labs" class="footer-navigation__list_link">Innovation</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/for-the-press" class="footer-navigation__list_link">For the press</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/resources" class="footer-navigation__list_link">Resources</a>
+ </li>
+ </ul>
+ </nav>
+
+ <div class="github-link-wrapper">
+ <a href="https://github.com/elifesciences" class="github-link">
+ <div class="github-link--text">Find us on GitHub</div>
+ </a>
+ </div>
+
+ </div>
+
+ <div class="grid-cell">
+
+ <div class="site-smallprint">
+ <small>eLife is a non-profit organisation inspired by research funders and led by scientists. Our mission is to help scientists accelerate discovery by operating a platform for research communication that encourages and recognises the most responsible behaviours in science.</small>
+ <small>eLife Sciences Publications, Ltd is a limited liability non-profit non-stock corporation incorporated in the State of Delaware, USA, with company number 5030732, and is registered in the UK with company number FC030576 and branch number BR015634 at the address:</small>
+
+ <address>
+ eLife Sciences Publications, Ltd<br>
+ Westbrook Centre, Milton Road<br>
+ Cambridge CB4 1YG<br>
+ UK
+ </address>
+ </div>
+
+ </div>
+
+ <div class="grid-cell">
+ <div class="site-smallprint site-smallprint__copyright">
+ <small>© <time>2019</time> eLife Sciences Publications Ltd. Subject to a <a href="https://creativecommons.org/licenses/by/4.0/" rel="license" class="site-smallprint__copyright_link">Creative Commons Attribution license</a>, except where otherwise noted. ISSN:&nbsp;2050-084X</small>
+ </div>
+ </div>
+
+ </div>
+
+</footer>
+
+
+ </div>
+
+ </div>
+ <link href="/assets/patterns/css/all.ad4007d5.css" rel="stylesheet">
+
+
+<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","licenseKey":"c53c018d69","applicationID":"29775807","transactionName":"NQQGNUZZWEACVhdZWQxOJQJAUVldTFQRRF8BDQE=","queueTime":0,"applicationTime":287,"atts":"GUMFQw5DS04=","errorBeacon":"bam.nr-data.net","agent":""}</script></body>
+
+</html>
diff --git a/mapreduce/tests/files/example.cdx b/python/tests/files/example.cdx
index 84e3271..84e3271 100644
--- a/mapreduce/tests/files/example.cdx
+++ b/python/tests/files/example.cdx
diff --git a/python/tests/files/example_grobid_metadata.json b/python/tests/files/example_grobid_metadata.json
new file mode 100644
index 0000000..a2d18db
--- /dev/null
+++ b/python/tests/files/example_grobid_metadata.json
@@ -0,0 +1,5 @@
+{"abstract": "In this paper an analytical model is presented for the Micro-Cantilever (MC) of Atomic Force Microscopy with Side Wall probe (AFM-SW) in the tapping excitation mode. In this model the couple motion of the MC is taken into account while the torsional motion is considered as an undesirable motion which is coupled with the vertical motion. To this end, the effect of several parameters, namely; probe mass, probe dislocation, sidewall extension length, and tip sample interaction force is investigated on the occurrence probability of torsional and vertical motions. It is found that the probe dislocation is the prerequisite factor of the undesired motion happening. For sake of validation, the analytical results are compared against the previously published results, and an excellent agreement is observed. Abstrak Dalam kertas ini, model analitikal dipersembahkan bagi micro-julur Mikroskop Daya Atom dengan prob dinding-sisi dan dalam mod pengujaan menoreh. Dalam model ini, gerakan pasangan bagi mikro-julur diambil kira manakala gerakan kilasan dianggap sebagai gerakan yang tidak diingini yang digandingkan dengan pergerakan menegak. Untuk tujuan ini , kesan daripada beberapa parameter, iaitu; jisim prob, kehelan prob, panjang lanjutan sisi, dan daya interaksi di antara tip dan sampel disiasat keatas kebarangkalian berlakunya gerakan kilasan dan menegak. Didapati bahawa kehelan prob adalah faktor prasyarat berlakunya gerakan yang tidak diingini. Untuk pengesahan, keputusan analisis ini dibandingkan dengan keputusan yang sebelum ini telah diterbitkan, dan didapati persetujuannya sangat baik. Kata kunci: Mokroskop daya atom, prob dind ing sisi, micro-jalur, getaran, gerakan pasangan", "acknowledgement": "Acknowledgement We are grateful for the UTM scholarship to Author 1. 
Authors gratefully acknowledge t he Research Institute of Petroleum Industry (RIPI) and the Iran Nanotechnology Laboratory Network (INLN) for their support.", "authors": [{"name": "Farzad Mokhtarinezhad"}, {"name": "Roslan Rahman"}, {"name": "Sina Eftekhar"}, {"name": "Sadegh Hassani"}], "citations": [{"authors": [{"name": "Julie Last"}, {"name": "Paul Russell"}, {"name": "P aul Nealey"}, {"name": "Christopher Murphy"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Investigative Ophthalmology & Visual Science", "publisher" : null, "title": "The applications of atomic force microscopy to vision science", "url": null, "volume": "51"}, {"authors": [{"name": "G Binnig"}, {"name": "C Quate"}, {"name": "C Geber"}], "date": "1986", "id": "b1", "index": 1, "issue": null, "journal": "Phys Rev Let", "publisher": null, "title": "Atomic force microscope", "url": null, "vol ume": "56"}, {"authors": [{"name": "C Wright"}, {"name": "Armstrong"}], "date": "2006", "id": "b2", "index": 2, "issue": null, "journal": "Surf Interface Anal", "publisher" : null, "title": "The application of atomic force microscopy force measurements to the characterisation of microbial surfaces", "url": null, "volume": "38"}, {"authors": [{ "name": "John Withers"}, {"name": "D Aston"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": "Advances in Colloid and Interface Science", "publisher": null, "title": "Nanomechanical measurements with AFM in the elastic limit", "url": null, "volume": "120"}, {"authors": [{"name": "Dara Bayat"}, {"name": "Terunobu Akiyama"}, {"name": "F Nicolaas"}, {"name": "Urs De Rooij"}, {"name": "Staufer"}], "date": "2008", "id": "b4", "index": 4, "issue": null, "journal": "Microelectronic Engineering", "p ublisher": null, "title": "Dynamic behavior of the tuning fork AFM probe", "url": null, "volume": "85"}, {"authors": [{"name": "M Kahrobaiyan"}, {"name": "M Ahmadian"}, {"name": "P Haghighi"}, {"name": "A Haghighi"}], "date": "2010", "id": 
"b5", "index": 5, "issue": null, "journal": "International Journal of Mechanical Sciences", "publisher": null, "title": "Sensitivity and resonant frequency of an AFM with sidewall and top-surface probes for both flexural and torsional modes", "url": null, "volume": "52"}, {"a uthors": [{"name": "Gaoliang Dai"}, {"name": "Helmut Wolff"}, {"name": "Frank Pohlenz"}, {"name": "Hans-Ulrich Danzebrink"}, {"name": "G5Cu00fcnter Wilkening"}], "date": "2006", "id": "b6", "index": 6, "issue": null, "journal": "APPLIED PHYSICS LETTERS", "publisher": null, "title": "Atomic force probe for sidewall scanning of nano-and micro structures", "url": null, "volume": "88"}, {"authors": [{"name": "Gaoliang Dai"}, {"name": "Helmutwolff"}, {"name": "Min Thomasweimann"}, {"name": "Frank Xu"}, {"name": "Ha ns-Ulrich Pohlenz"}, {"name": "Danzebrink"}], "date": "2007", "id": "b7", "index": 7, "issue": null, "journal": "Meas. Sci. Technol", "publisher": null, "title": "Nanoscale surface measurements at sidewalls of nanoand micro-structures", "url": null, "volume": "18"}, {"authors": [{"name": "Win-Jin Chang"}, {"name": "Haw-Long Lee"}, {"name": "T erry Yuan-Fang Chen"}], "date": "2008", "id": "b8", "index": 8, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Study of the sensitivity of the fi rst four flexural modes of an AFM cantilever with a sidewall probe", "url": null, "volume": "108"}, {"authors": [{"name": "Xiaohui Tang"}, {"name": "Vincent Bayot"}, {"name": "Nicolas Reckinger"}, {"name": "Denis Flandre"}, {"name": "Jean-Pierre Raskin"}, {"name": "Emmanuel Dubois"}, {"name": "Bernard Nysten"}], "date": "2009", "id": "b9", "i ndex": 9, "issue": null, "journal": "IEEE Transactions on Nanotechnogoly", "publisher": null, "title": "A Simple Method for Measuring Si-Fin Sidewall Roughness by AFM", "ur l": null, "volume": "8"}, {"authors": [{"name": "Ali Hossein Nejat Pishkenari"}, {"name": "Meghdari"}], "date": "2011", "id": "b10", "index": 10, "issue": null, 
"journal": "Ultramicroscopy", "publisher": null, "title": "Influence of the tip mass on the tip-sample interactions in TM-AFM", "url": null, "volume": "111"}, {"authors": [{"name": "S ohrab Eslami"}, {"name": "Naderjalili"}], "date": "2012", "id": "b11", "index": 11, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A comprehensiv e modeling and vibration analysis of AFM microcantilevers subjected to nonlinear tip-sample interaction forces", "url": null, "volume": "117"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Journal of Applied Physics", "publisher": null, "title": "Couplin g of cantilever lateral bending and torsion in torsional resonance and lateral excitation modes of atomic force microscopy", "url": null, "volume": "99"}, {"authors": [{"name": "Haw-Long Lee"}, {"name": "Win-Jin Chang"}], "date": "2008", "id": "b13", "index": 13, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Couple d lateral bending-torsional vibration sensitivity of atomic force microscope cantilever", "url": null, "volume": "108"}, {"authors": [{"name": "Farzad Mokhtarinezhad"}], "d ate": "2015", "id": "b14", "index": 14, "issue": null, "journal": null, "publisher": null, "title": "Jurnal Teknologi (Sciences & Engineering)", "url": null, "volume": "76" }, {"authors": [{"name": "F Mokhtari-Nezhad"}, {"name": "A Saidi"}, {"name": "S Ziaei-Rad"}], "date": "2009", "id": "b15", "index": 15, "issue": null, "journal": "Ultramicr oscopy", "publisher": null, "title": "Influence of the tip mass and position on the AFM cantilever dynamics: Coupling between bending, torsion and flexural modes", "url": null, "volume": "109"}, {"authors": [{"name": "Arvind Raman"}, {"name": "John Melcher"}, {"name": "Ryan Tung"}], "date": "2008", "id": "b16", "index": 16, "issue": null, "jo urnal": "Nanotodays", "publisher": null, "title": "Cantilever dynamics in atomic force 
microscopy", "url": null, "volume": "3"}, {"authors": [{"name": "Nader Jalili"}, {"name": "Karthik Laxminarayana"}], "date": "2004", "id": "b17", "index": 17, "issue": null, "journal": "Mechatronic", "publisher": null, "title": "A review of atomic force mic roscopy imaging systems: application to molecular metrology and biological sciences", "url": null, "volume": "14"}, {"authors": [{"name": "B Derjaguin"}, {"name": "V Muller "}, {"name": "Y Toporov"}], "date": "1975", "id": "b18", "index": 18, "issue": null, "journal": "J. Colloid Interf. Sci", "publisher": null, "title": "Effect of contact def ormations on the adhesion of particles", "url": null, "volume": "53"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b19", "index ": 19, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Simulation of dynamic modes of atomic force microscopy using a 3D finite element model", "u rl": null, "volume": "106"}, {"authors": [{"name": "K Johnson"}, {"name": "K Kendall"}, {"name": "A Roberts"}], "date": "1971", "id": "b20", "index": 20, "issue": null, "jo urnal": "Proc. R. Soc. London Ser. 
A", "publisher": null, "title": "Surface energy and the contact of elastic solids", "url": null, "volume": "324"}, {"authors": [{"name": "D Gorman"}], "date": "1975", "id": "b21", "index": 21, "issue": null, "journal": null, "publisher": null, "title": "Free Vibration Analysis of Beams and Shafts", "url": null, "volume": null}, {"authors": [{"name": "M Mahdavi"}, {"name": "A Farshidianfar"}, {"name": "M Tahani"}, {"name": "S Mahdavi"}, {"name": "H Dalir"}], "date": "2008", "id ": "b22", "index": 22, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A more comprehensive modeling of atomic force microscope cantilever", "url" : null, "volume": "109"}, {"authors": [{"name": "M Reinstadtler"}, {"name": "U Rabe"}, {"name": "V Scherer"}, {"name": "U Hartmann"}, {"name": "A Goldade"}, {"name": "B Bhu shan"}, {"name": "W Arnold"}], "date": "2003", "id": "b23", "index": 23, "issue": null, "journal": "Applied physics letters", "publisher": null, "title": "On the nanoscale measurement of friction using atomic-force microscope cantilever torsional resonances", "url": null, "volume": "82"}, {"authors": [{"name": "M Reinst5Cu00e4dtler"}, {"name": "T Kasai"}, {"name": "U Rabe"}, {"name": "B Bhushan"}, {"name": "W Arnold"}], "date": "2005", "id": "b24", "index": 24, "issue": null, "journal": "Journal of Physics D: Applied Physics", "publisher": null, "title": "Imaging and measurement of elasticity and friction using the TRmode", "url": null, "volume": "38"}], "date": "2015", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": "76"}, "title": "Jurnal Teknologi Full Paper INVESTIGATION OF TORSI ONAL DEFLECTION AS AN UNDESIRED MOTION IN ATOMIC FORCE MICROSCOPY WITH SIDEWALL PROBE"}
+{"abstract": "Eight months after triple valve replacement with Bjork-Shiley tilting disc valves a patient developed symptoms and signs suggesting malfunction of the prosthesis in the tricuspid position. This was confirmed by echocardiography and angiocardiography, and at operation the di sc of the prosthesis was found to be stuck half-open by fibrin and clot. A further 11 patients with the same type of prosthesis in the tricuspid position were then studied by phonocardiography and echocardiography. In one of these the prosthesis was found to be stuck and this was confirmed by angiocardiography and surgery. These 2 cases are r eported in detail and thefindings in the other 10 are discussed. The implications of this high incidence of malfunction of the Bj6rk-Shiley prosthesis in the tricuspid posi tion are considered. Echocardiography appears to be essential in the follow-up of such patients.", "acknowledgement": null, "authors": [{"name": "P Bourdillon"}, {"name": " G Sharratt"}], "citations": [{"authors": [{"name": "J Assad-Morell"}, {"name": "A Tajik"}, {"name": "M Anderson"}, {"name": "R Tancredi"}, {"name": "R Wallace"}, {"name": " E Giuliani"}], "date": "1974", "id": "b0", "index": 0, "issue": null, "journal": "Mayo Clinic Proceedings", "publisher": null, "title": "Malfunctioning tricuspid valve pros thesis", "url": null, "volume": "49"}, {"authors": [{"name": "R Bache"}, {"name": "A From"}, {"name": "A Castaneda"}, {"name": "C Jorgensen"}, {"name": "Wang"}, {"name": "Y "}], "date": "1972", "id": "b1", "index": 1, "issue": null, "journal": "Chest", "publisher": null, "title": "Late thrombotic obstruction of Starr-Edwards tricuspid valve pr osthesis", "url": null, "volume": null}, {"authors": [{"name": "I Belenkie"}, {"name": "M Carr"}, {"name": "R Schlant"}, {"name": "D Nutter"}, {"name": "P Symbas"}], "date" : "1973", "id": "b2", "index": 2, "issue": null, "journal": "American Heart,Journal", "publisher": null, "title": "Malfunction of a Cutter Smeloff mitral 
ball valve prosthe sis: diagnosis by phonocardiography and echocardiography", "url": null, "volume": "86"}, {"authors": [{"name": "J Douglas"}, {"name": "Williams"}, {"name": "G"}], "date": " 1974", "id": "b3", "index": 3, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic evaluation of the Bjork-Shiley prosthetic valve", "ur l": null, "volume": "50"}, {"authors": [{"name": "J Gimenez"}, {"name": "W Winters"}, {"name": "Jr"}, {"name": "J Davila"}, {"name": "J Connell"}, {"name": "K Klein"}], "da te": "1965", "id": "b4", "index": 4, "issue": null, "journal": "American Journal of the Medical Sciences", "publisher": null, "title": "Dynamics of the StarrEdwards ball va lve prosthesis: a cine-fluorographic and ultrasonic study in humans", "url": null, "volume": "250"}, {"authors": [{"name": "M Johnson"}, {"name": "J Holmes"}, {"name": "Pat on"}, {"name": "B"}], "date": "1973", "id": "b5", "index": 5, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic determination of mitra l disc valve excursion", "url": null, "volume": "47"}, {"authors": [{"name": "M Johnson"}, {"name": "B Paton"}, {"name": "J Holmes"}], "date": "1970", "id": "b6", "index": 6, "issue": null, "journal": "Circulation", "publisher": null, "title": "Ultrasonic evaluation of prosthetic valve motion", "url": null, "volume": null}, {"authors": [{"name": "H Miller"}, {"name": "D Gibson"}, {"name": "J Stephens"}], "date": "1973", "id": "b7", "index": 7, "issue": null, "journal": "British Heart Journal", "publisher": null , "title": "Role of echocardiography and phonocardiography in diagnosis of mitral paraprosthetic regurgitation with Starr-Edwards prostheses", "url": null, "volume": "35"}, {"authors": [{"name": "P Oliva"}, {"name": "M Johnson"}, {"name": "M Pomerantz"}, {"name": "Levene"}, {"name": "A"}], "date": "1973", "id": "b8", "index": 8, "issue": null , "journal": "American journal of Cardiology", "publisher": null, "title": 
"Dysfunction of the Beall mitral prosthesis and its detection by cinefluoroscopy and echocardiogr aphy", "url": null, "volume": null}, {"authors": [{"name": "J Pfeifer"}, {"name": "N Goldschlager"}, {"name": "T Sweatman"}, {"name": "F Gerbode"}, {"name": "A Selzer"}], " date": "1972", "id": "b9", "index": 9, "issue": null, "journal": "American J7ournal of Cardiology", "publisher": null, "title": "Malfunction of mitral ball valve prosthesis due to thrombus: report of 2 cases with notes on early clinical diagnosis", "url": null, "volume": "29"}, {"authors": [{"name": "H Samaan"}, {"name": "R Murali"}], "date": "1970", "id": "b10", "index": 10, "issue": null, "journal": "Thorax", "publisher": null, "title": "Acute tricuspid valve obstruction following the use of tricuspid ball va lve prosthesis", "url": null, "volume": null}, {"authors": [{"name": "S Suwansirikul"}, {"name": "E Glassman"}, {"name": "F Raia"}, {"name": "F Spencer"}], "date": "1974", "id": "b11", "index": 11, "issue": null, "journal": "American J'ournal of Cardiology", "publisher": null, "title": "Late thrombosis of Starr-Edwards tricuspid ball valve pr osthesis", "url": null, "volume": "34"}, {"authors": [{"name": "Vander"}, {"name": "J Veer"}, {"name": "Jr"}, {"name": "G Rhyneer"}, {"name": "R Hodam"}, {"name": "F Kloste r"}], "date": "1971", "id": "b12", "index": 12, "issue": null, "journal": "Circulation", "publisher": null, "title": "Obstruction of tricuspid ball-valve prostheses", "url" : null, "volume": null}, {"authors": [{"name": "W Winters"}, {"name": "Jr"}, {"name": "J Gimenez"}, {"name": "L Soloff"}], "date": "1967", "id": "b13", "index": 13, "issue" : null, "journal": "American journal of Cardiology", "publisher": null, "title": "Clinical application of ultrasound in the analysis of prosthetic ball valve function", "ur l": null, "volume": "19"}, {"authors": [{"name": "P D V Requests For Reprints To Dr"}, {"name": "Western Bourdillon"}, {"name": "Hospital"}, {"name": "Oakley Road"}], "date 
": false, "id": "b14", "index": 14, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}], "date": "1976", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": "British Heart Journal", "publisher": null, "volume": "38"}, "title": "Malfunction of Bjork-Shiley valve prosthesis in tricuspid positi on"}
+{"abstract": "The interference is the major factor disrupting the sending of information in wireless networks. To ge t better performance for these networks as well in the conventional case as in cooperative one, all the necessary ways must be used to eliminate network interference. This article deals with the concept of Physical Layer Network Coding (PLNC). It is a way to exploit the operation of Network Coding (NC) that occurs naturally in the superimpose d electromagnetic waves (EM). It is a simple physical effect when several EM waves meet in the same physical space, they are mixed together. This mixture of EM waves is a f orm of NC produced by nature. Hence, the situation will be reversed and the interference will be a beneficial way to help the relay when sending information. This paper foc uses on the Symbol Error Rate (SER) Analysis of PLNC in the case of 16QAM modulator. It will exploit in detail the concept of mapping (modulation/demodulation) and will dem onstrate its contribution compared to NC and Traditional Network (TN).", "acknowledgement": "Conclusion In this paper, we took a brief description of different cases of coo perative networks in the case of TWRC. We describe the Traditional cooperative Networks, then the Network Coding, and finally, the Physical Layer Network Coding. This one a llows us to reduce the number of phases from 4 to 2. Furthermore, this paper illustrates that in PLNC and for the case of 16QAM constellation, the SER is lower than in the standard modulation case. 
This is verified with the modulation/demodulation study done and for the In-phase and quadrature case of the modulator.", "authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "citations": [{"authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "date": "2012", "id": "b0", "index": 0, "issue": null, "journal": "IEEE Electrotechnical Conference (MELECON)", "publisher": null, "title": "Optimizing of Power Allocation for Two-Hop DF Relaying Systems", "url": null, "volume": null}, {"autho rs": [{"name": "J Proakis"}], "date": "1989", "id": "b1", "index": 1, "issue": null, "journal": null, "publisher": null, "title": "Digital Communication", "url": null, "vol ume": null}, {"authors": [{"name": "S Tian"}, {"name": "Li Yonghui"}, {"name": "B Vucetic"}], "date": "2011", "id": "b2", "index": 2, "issue": null, "journal": "IEEE ICC", "publisher": null, "title": "A Near Optimal Amplify and Forward Relaying in Two-Way Relay Networks", "url": null, "volume": null}, {"authors": [{"name": "S Zhang"}, {"name" : "S Liew"}, {"name": "P Lam"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": null, "publisher": null, "title": "Physical Layer Network Coding. ACM Mob iCom", "url": null, "volume": null}], "date": "2013", "doi": null, "journal": {"eissn": null, "issn": null, "issue": "3", "name": "AWERProcedia Information Technology & Com puter Science", "publisher": null, "volume": "03"}, "title": "SER Analysis of Two-Hop Physical Layer Network Coding with 16QAM Modulator, AWERProcedia Information Technolog y & Computer Science"}
+{"abstract": "Suffix trees are by far the most important data structure in stringology, with myriads of applications in fields like bioinformatics and information retrieval. Classical representations of suffix trees require O(n log n) bits of space, for a string of size n. This is consid erably more than the n log 2 5Cu03c3 bits needed for the string itself, where 5Cu03c3 is the alphabet size. The size of suffix trees has been a barrier to their wider a doption in practice. Recent compressed suffix tree representations require just the space of the compressed string plus 5Cu0398(n) extra bits. This is already spectacular , but still unsatisfactory when 5Cu03c3 is small as in DNA sequences. In this paper we introduce the first compressed suffix tree representation that breaks this linear-s pace barrier. Our representation requires sublinear extra space and supports a large set of navigational operations in logarithmic time. An essential ingredient of our repr esentation is the lowest common ancestor (LCA) query. We reveal important connections between LCA queries and suffix tree navigation.", "acknowledgement": null, "authors": [{"name": "Lu5Cu00eds Russo"}, {"name": "Gonzalo Navarro"}, {"name": "Arlindo Oliveira"}], "citations": [{"authors": [{"name": "A Apostolico"}], "date": "1985", "id": "b0 ", "index": 0, "issue": null, "journal": "Combinatorial Algorithms on Words. NATO ISI Series", "publisher": null, "title": "The myriad virtues of subword trees", "url": null, "volume": null}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2000", "id": "b1", "index": 1, "issue": null, "journal": "Proceedings of LATIN ", "publisher": null, "title": "The LCA problem revisited", "url": null, "volume": "1776"}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2004", "id": "b2", "index": 2, "issue": "1", "journal": "Theor. Comp. 
Sci", "publisher": null, "title": "The level ancestor problem simplified", "url": null, "volume": "321"}, {" authors": [{"name": "M Farach"}], "date": "1997", "id": "b3", "index": 3, "issue": null, "journal": "Proceedings of FOCS", "publisher": null, "title": "Optimal suffix tree construction with large alphabets", "url": null, "volume": null}, {"authors": [{"name": "P Ferragina"}, {"name": "G Manzini"}, {"name": "V M5Cu00e4kinen"}, {"name": "G Na varro"}], "date": "2007", "id": "b4", "index": 4, "issue": "2", "journal": "ACM Trans. Algor", "publisher": null, "title": "Compressed representations of sequences and full -text indexes", "url": null, "volume": "3"}, {"authors": [{"name": "J Fischer"}, {"name": "V Heun"}], "date": "2007", "id": "b5", "index": 5, "issue": null, "journal": "Pro ceedings of ESCAPE", "publisher": null, "title": "A new succinct representation of RMQ-information and improvements in the enhanced suffix array", "url": null, "volume": "4 614"}, {"authors": [{"name": "L Foschini"}, {"name": "R Grossi"}, {"name": "A Gupta"}, {"name": "J Vitter"}], "date": "2006", "id": "b6", "index": 6, "issue": "4", "journal ": "ACM Trans. Algor", "publisher": null, "title": "When indexing equals compression: Experiments with compressing suffix arrays and applications", "url": null, "volume": " 2"}, {"authors": [{"name": "R Geary"}, {"name": "R Raman"}, {"name": "V Raman"}], "date": "2004", "id": "b7", "index": 7, "issue": null, "journal": "Proceedings of SODA", " publisher": null, "title": "Succinct ordinal trees with level-ancestor queries", "url": null, "volume": null}, {"authors": [{"name": "R Giegerich"}, {"name": "S Kurtz"}, {"name": "J Stoye"}], "date": "2003", "id": "b8", "index": 8, "issue": "11", "journal": "Softw., Pract. 
Exper", "publisher": null, "title": "Efficient implementation of lazy suffix trees", "url": null, "volume": "33"}, {"authors": [{"name": "D Gusfield"}], "date": "1997", "id": "b9", "index": 9, "issue": null, "journal": null, "publisher": null , "title": "Algorithms on Strings, Trees and Sequences", "url": null, "volume": null}, {"authors": [{"name": "D Knuth"}, {"name": "J"}, {"name": "V Pratt"}], "date": "1977" , "id": "b10", "index": 10, "issue": "2", "journal": "SIAM J. Comput", "publisher": null, "title": "Fast pattern matching in strings", "url": null, "volume": "6"}, {"author s": [{"name": "S Lee"}, {"name": "K Park"}], "date": "2007", "id": "b11", "index": 11, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic rank-select structures with applications to run-length encoded texts", "url": null, "volume": "4580"}, {"authors": [{"name": "V M5Cu00e4kinen"}, {"name": "G Navarro"}], " date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic entropy-compressed sequences and full-text ind exes", "url": null, "volume": "4009"}, {"authors": [{"name": "U Manber"}, {"name": "E Myers"}], "date": "1993", "id": "b13", "index": 13, "issue": "5", "journal": "SIAM J. Comput", "publisher": null, "title": "Suffix arrays: A new method for on-line string searches", "url": null, "volume": "22"}, {"authors": [{"name": "G Manzini"}], "date": " 2001", "id": "b14", "index": 14, "issue": "3", "journal": "J. ACM", "publisher": null, "title": "An analysis of the Burrows-Wheeler transform", "url": null, "volume": "48"} , {"authors": [{"name": "E Mccreight"}], "date": "1976", "id": "b15", "index": 15, "issue": "2", "journal": "J. 
ACM", "publisher": null, "title": "A space-economical suffix tree construction algorithm", "url": null, "volume": "32"}, {"authors": [{"name": "G Navarro"}, {"name": "V M5Cu00e4kinen"}], "date": "2007", "id": "b16", "index": 16, " issue": "1", "journal": "ACM Comp. Surv", "publisher": null, "title": "Compressed full-text indexes", "url": null, "volume": "39"}, {"authors": [{"name": "R Raman"}, {"name": "V Raman"}, {"name": "S Rao"}], "date": "2002", "id": "b17", "index": 17, "issue": null, "journal": "Proceedings of SODA", "publisher": null, "title": "Succinct indexabl e dictionaries with applications to encoding k-ary trees and multisets", "url": null, "volume": null}, {"authors": [{"name": "L Russo"}, {"name": "A Oliveira"}], "date": "2 006", "id": "b18", "index": 18, "issue": null, "journal": "Proceedings of SPIRE", "publisher": null, "title": "A compressed self-index using a Ziv-Lempel dictionary", "url" : null, "volume": "4209"}, {"authors": [{"name": "K Sadakane"}], "date": "2003", "id": "b19", "index": 19, "issue": "2", "journal": "J. of Algorithms", "publisher": null, " title": "New text indexing functionalities of the compressed suffix arrays", "url": null, "volume": "48"}, {"authors": [{"name": "K Sadakane"}], "date": "2007", "id": "b20" , "index": 20, "issue": null, "journal": "Theo. Comp. 
Sys", "publisher": null, "title": "Compressed Suffix Trees with Full Functionality", "url": null, "volume": null}, {"a uthors": [{"name": "E Ukkonen"}], "date": "1995", "id": "b21", "index": 21, "issue": "3", "journal": "Algorithmica", "publisher": null, "title": "Construting suffix trees o n-line in linear time", "url": null, "volume": "14"}, {"authors": [{"name": "N V5Cu00e4lim5Cu00e4ki"}, {"name": "W Gerlach"}, {"name": "K Dixit"}, {"name": "V M5Cu00e 4kinen"}], "date": "2007", "id": "b22", "index": 22, "issue": null, "journal": "Proceedings of WEA", "publisher": null, "title": "Engineering a compressed suffix tree imple mentation", "url": null, "volume": "4525"}, {"authors": [{"name": "P Weiner"}], "date": "1973", "id": "b23", "index": 23, "issue": null, "journal": "Proceedings of IEEE Sym p. on Switching and Automata Theory", "publisher": null, "title": "Linear pattern matching algorithms", "url": null, "volume": null}], "date": false, "doi": null, "journal" : {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": null}, "title": "Fully-Compressed Suffix Trees"}
+{"abstract": null, "acknowledgement": null, "authors": [{"name": "Carine Van Huls Van Taxis"}, {"name": "Sebastiaan Piers"}, {"name": "Marta De Riva Silva"}, {"name": "Olaf Dekkers"}, {"name": "Dani5Cu00ebl Pijnappels"}, {"name": "Martin Schalij"}, {"name": "Adrianus Wijnmaalen"}, {"name": "Katja Zeppenfeld"}], "citations": [{"authors": [{"name": "T Baman"}, {"name": "D Lange"}, {"name": "K Ilg"}, {"name": "S Gupta"}, {"name": "T Liu"}, {"name": "C Algui re"}, {"name": "W Armstrong"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "K Jongnarangsin"}, {"name": "F Pelosi"}, {"name": "Jr Crawford"}, {"name": "T Ebinger"}, { "name": "M Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Relationship between burden of premature ventricular complexes and left ventricular function", "url": null, "volume": "7"}, {"authors": [{"name": "M Yokoka wa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Alguire"}, {"name": "C Armstrong"}, {"name": "W Crawford"}, {"name": "T Jongnarangsin"}, {"name": "K Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": "b1", "index": 1, "issue": null, "journal": "Heart Rh ythm", "publisher": null, "title": "Relation of symptoms and symptom duration to premature ventricular complex-induced cardiomyopathy", "url": null, "volume": "9"}, {"autho rs": [{"name": "M Yokokawa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "T Crawford"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Jongnarangsin"}, {"name": "K Latchamsetty"}, {"name": "R Armstrong"}, {"name": "W Alguire"}, {"name": "C Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": " b2", "index": 2, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Impact of QRS duration of frequent premature ventricular complexes on 
the developmen t of cardiomyopathy", "url": null, "volume": "9"}, {"authors": [{"name": "P Carballeira"}, {"name": "M Deyell"}, {"name": "D Frankel"}, {"name": "D Benhayon"}, {"name": "F Squara"}, {"name": "W Chik"}, {"name": "M Kohari"}, {"name": "R Deo"}, {"name": "F Marchlinski"}], "date": "2014", "id": "b3", "index": 3, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Ventricular premature depolarization QRS duration as a new marker of risk for the development of ventricular premature depolarization- induced cardiomyopathy", "url": null, "volume": "11"}, {"authors": [{"name": "E Aliot"}, {"name": "W Stevenson"}, {"name": "J Almendral-Garrote"}, {"name": "F Bogun"}, {"name": "C Calkins"}, {"name": "E Delacretaz"}, {"name": "B Della"}, {"name": "G Hindricks"}, {"name": "P Jais"}, {"name": "M Josephson"}, {"name": "J Kautzner"}, {"name": "G Kay"}, {"name": "K Kuck"}, {"name": "B Lerman"}, {"name": "F Marchlinski"}, {"name": "V Reddy"}, {"name": "M Schalij"}, {"name": "R Schilling"}, {"name": "K Soejima"}, {"name": "Wilber Ehra/"}], "date": false, "id": "b4", "index": 4, "issue": null, "journal": null, "publisher": null, "title": "HRS Expert Consensus on Catheter Ablation of Ventricular Arrhythmias: developed in a partnership with the European Heart Rhythm Association (EHRA), a Registered Branch of the European Society of Cardiology (ESC), and the Heart Rhythm Society", "url": null, "volume": null}, {"authors": [{"name": "B Ts"}, {"name": "S"}, {"name": "S Ilg"}, {"name": "K Gupta"}, {"name": "S Liu"}, {"name": "T T Ty Y Alguire"}, {"name": "C"}, {"name": "Ar Ar Rms M Tron On Ong"}, {"name": "W Good"}, {"name": "E Chugh"}, {"name": "A A"}, {"name": "A"}, {"name": "J J Jongnaran N Ngs G s Gsin In In K K K"}, {"name": "Pe Pe Pelo Lo Losi Si Si"}, {"name": "F"}, {"name": ","}, {"name": "Jr"}], "date": false, "id": "b5", "index": 5, "issue": null, "journal": null, "publisher": null, "title": "Cra ra rawf wf wfor o o d d d T, T, T, Ebi i in ng ger 
r M M M", "url": null, "volume": null}, {"authors": [{"name": "M"}, {"name": "M"}, {"name": "M Kim M M H H Hm"}, {"name": "M Goo Oo Od"}, {"name": "E Chugh G G A A A, Pe Pe Pelo Lo Losi S"}, {"name": "F Jr R R. ; W"}, {"name": "Crawford T ; Mo Mo Mora Ra Rady Dy Dy F F F"}, {"name": "Bo Bogu Gu Gun N"}, {"name": "F"}], "date": "2012", "id": "b6", "index": 6, "issue": null, "journal": null, "publisher": null, "title": "Rel el elat at atio io ion n n of of of s s sym ym ympt pt ptom om oms s s an an and d d sy sy symp mp mpto to tom m m du du dur r ration o o o p p pre re rema ma matu tu ture re re v v ve e ent nt ntri ri ricu cu cula la ar r r co co comp mp mple le ex x x-i ind nd duc uc uced", "url": null, "volume": "20"}, {"authors": [{"name": "Cardiology Col lege Of"}], "date": "2009", "id": "b7", "index": 7, "issue": null, "journal": null, "publisher": null, "title": "ACC) and the American Heart Association (AHA). Heart Rhythm ", "url": null, "volume": "6"}, {"authors": [{"name": "D Zipes"}, {"name": "A Camm"}, {"name": "M Borggrefe"}, {"name": "A Buxton"}, {"name": "B Chaitman"}, {"name": "M Fro mer"}, {"name": "G Gregoratos"}, {"name": "G Klein"}, {"name": "A Moss"}, {"name": "R Myerburg"}, {"name": "S Priori"}, {"name": "M Quinones"}, {"name": "D Roden"}, {"name" : "M Silka"}, {"name": "C Tracy"}, {"name": "S Smith"}, {"name": "Jr Jacobs"}, {"name": "A Adams"}, {"name": "C Antman"}, {"name": "E Anderson"}, {"name": "J Hunt"}, {"name": "S Halperin"}, {"name": "J Nishimura"}, {"name": "R Ornato"}, {"name": "J Page"}, {"name": "R Riegel"}, {"name": "B Priori"}, {"name": "S Blanc"}, {"name": "J Budaj"}, { "name": "A Camm"}, {"name": "A Dean"}, {"name": "V Deckers"}, {"name": "J Despres"}, {"name": "C Dickstein"}, {"name": "K Lekakis"}, {"name": "J Mcgregor"}, {"name": "K Met ra"}, {"name": "M Morais"}, {"name": "J Osterspey"}, {"name": "A Tamargo"}, {"name": "J Zamorano"}, {"name": "J"}], "date": "2006", "id": "b8", "index": 8, "issue": null, " journal": "J Am 
Coll Cardiol", "publisher": null, "title": "ACC/AHA/ESC 2006 guidelines for management of patients with ventricular arrhythmias and the prevention of sudden cardiac death: a report of the American College of Cardiology/American Heart Association Task Force and the European Society of Cardiology Committee for Practice Guideline s (Writing Committee to Develop Guidelines for Management of Patients With Ventricular Arrhythmias and the Prevention of Sudden Cardiac Death)", "url": null, "volume": "48" }, {"authors": [{"name": "Y Sekiguchi"}, {"name": "K Aonuma"}, {"name": "Y Yamauchi"}, {"name": "T Obayashi"}, {"name": "A Niwa"}, {"name": "H Hachiya"}, {"name": "A Takaha shi"}, {"name": "J Nitta"}, {"name": "Y Iesaka"}, {"name": "M Isobe"}], "date": "2005", "id": "b9", "index": 9, "issue": null, "journal": "J Cardiovasc Electrophysiol", "pu blisher": null, "title": "Chronic hemodynamic effects after radiofrequency catheter ablation of frequent monomorphic ventricular premature beats", "url": null, "volume": "1 6"}, {"authors": [{"name": "H Tada"}, {"name": "S Ito"}, {"name": "G Shinbo"}, {"name": "K Tadokoro"}, {"name": "I Ito"}, {"name": "T Hashimoto"}, {"name": "K Miyaji"}, {"name": "K Kaseno"}, {"name": "S Naito"}, {"name": "A Nogami"}, {"name": "S Oshima"}, {"name": "K Taniguchi"}], "date": "2006", "id": "b10", "index": 10, "issue": null, "jour nal": "Pacing Clin Electrophysiol", "publisher": null, "title": "Significance and utility of plasma brain natriuretic peptide concentrations in patients with idiopathic ven tricular arrhythmias", "url": null, "volume": "29"}, {"authors": [{"name": "F Knebel"}, {"name": "I Schimke"}, {"name": "K Pliet"}, {"name": "S Schattke"}, {"name": "S Mart in"}, {"name": "A Borges"}, {"name": "G Baumann"}], "date": "2005", "id": "b11", "index": 11, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "NT-ProBN P in acute heart failure: correlation with invasively measured hemodynamic parameters during recompensation", 
"url": null, "volume": "11"}, {"authors": [{"name": "R Krittay aphong"}, {"name": "T Boonyasirinant"}, {"name": "P Saiviroonporn"}, {"name": "P Thanapiboonpol"}, {"name": "S Nakyen"}, {"name": "S Udompunturak"}], "date": "2008", "id": "b12", "index": 12, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "Correlation Between NT-pro BNP levels and left ventricular wall stress, sphericity index and extent of myocardial damage: a magnetic resonance imaging study", "url": null, "volume": "14"}, {"authors": [{"name": "S Yuda"}, {"name": "V Khoury"}, {"name": " T Marwick"}], "date": "2002", "id": "b13", "index": 13, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Influence of wall stress and left ventri cular geometry on the accuracy of dobutamine stress echocardiography", "url": null, "volume": "40"}, {"authors": [{"name": "L Krupp"}, {"name": "N Larocca"}, {"name": "J Mu ir-Nash"}, {"name": "A Steinberg"}], "date": "1989", "id": "b14", "index": 14, "issue": null, "journal": "Arch Neurol", "publisher": null, "title": "The fatigue severity sc ale. 
Application to patients with multiple sclerosis and systemic lupus erythematosus", "url": null, "volume": "46"}, {"authors": [{"name": "F Gustafsson"}, {"name": "F Ste ensgaard-Hansen"}, {"name": "J Badskjaer"}, {"name": "A Poulsen"}, {"name": "P Corell"}, {"name": "P Hildebrandt"}], "date": false, "id": "b15", "index": 15, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "Tad Da"}, {"name": "H"}, {"name": ", Ito To"}, {"name": "S"}, {"name": ", Sh S Inbo" }, {"name": "G Tadokoro"}, {"name": "K Ito"}, {"name": "I"}, {"name": "Has Shi Himoto"}, {"name": "T Miyaji"}, {"name": "K"}, {"name": "Ka"}, {"name": "K Naito"}, {"name": "S No O Oga Ga Gam Mi Mi A A A, O Os Oshi Hi Im M Ma"}, {"name": "S Taniguchi"}, {"name": "K"}], "date": false, "id": "b16", "index": 16, "issue": null, "journal": null, "p ublisher": null, "title": "onc nc centrations n n i i in n n pa p p ti ti tien en ents ts ts w w with h h id id idio io iopa pa path th thic ic ic v v ven en entr tr tricul lar r r arr rr rhy hy hyth th thmi m m as as as", "url": null, "volume": null}, {"authors": [{"name": "K K Kne Ne Nebe"}, {"name": "F"}, {"name": "F"}, {"name": "F"}, {"name": "S S Sch Ch Chim Im Mke Ke Ke I"}, {"name": ","}, {"name": "Pl Pl P Iet T K"}, {"name": "K"}, {"name": "K"}, {"name": "S S Sch Ch Chattk Tk Ke E S"}, {"name": "S"}, {"name": "S Ti Tin N S"}, {"name": "S"}, {"name": "S B B"}], "date": false, "id": "b17", "index": 17, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "S Vickery"}, {"name": "C Price"}, {"name": "R John"}, {"name": "N Abbas"}, {"name": "M Webb"}, {"name": "M Kempson"}, {"name": "E Lamb"}], "da te": "2005", "id": "b18", "index": 18, "issue": null, "journal": "Am J Kidney Dis", "publisher": null, "title": "B-type natriuretic peptide (BNP) and amino-terminal proBNP in patients with CKD: relationship to renal function and left ventricular hypertrophy", "url": null, 
"volume": "46"}, {"authors": [{"name": "C Van Huls Van Taxis"}, {"name" : "A Wijnmaalen"}, {"name": "D Den Uijl"}, {"name": "M Gawrysiak"}, {"name": "H Putter"}, {"name": "M Schalij"}, {"name": "K Zeppenfeld"}], "date": "2011", "id": "b19", "in dex": 19, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Reversed polarity of bipolar electrograms for predicting a successful ablation site in foca l idiopathic right ventricular outflow tract arrhythmias", "url": null, "volume": "8"}, {"authors": [{"name": "D Penela"}, {"name": "C Van Huls Van Taxis"}, {"name": "L Agu inaga"}, {"name": "J Fernandez-Armenta"}, {"name": "L Mont"}, {"name": "M Castel"}, {"name": "M Heras"}, {"name": "J Tolosana"}, {"name": "M Sitges"}, {"name": "A Ordonez"} , {"name": "J Brugada"}, {"name": "K Zeppenfeld"}, {"name": "A Berruezo"}], "date": "2013", "id": "b20", "index": 20, "issue": null, "journal": "J Am Coll Cardiol", "publis her": null, "title": "Neurohormonal, structural, and functional recovery pattern after premature ventricular complex ablation is independent of structural heart disease sta tus in patients with depressed left ventricular ejection fraction: a prospective multicenter study", "url": null, "volume": "62"}, {"authors": [{"name": "S Niwano"}, {"name": "Y Wakisaka"}, {"name": "H Niwano"}, {"name": "H Fukaya"}, {"name": "S Kurokawa"}, {"name": "M Kiryu"}, {"name": "Y Hatakeyama"}, {"name": "T Izumi"}], "date": "2009", " id": "b21", "index": 21, "issue": null, "journal": "Heart", "publisher": null, "title": "Prognostic significance of frequent premature ventricular contractions originating from the ventricular outflow tract in patients with normal left ventricular function", "url": null, "volume": "95"}, {"authors": [{"name": "L Costello-Boerrigter"}, {"name" : "G Boerrigter"}, {"name": "M Redfield"}, {"name": "R Rodeheffer"}, {"name": "L Urban"}, {"name": "D Mahoney"}, {"name": "S Jacobsen"}, {"name": "D Heublein"}, {"name": "J Burnett"}], "date": 
"2006", "id": "b22", "index": 22, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Amino-terminal pro-B-type natriuretic pep tide and B-type natriuretic peptide in the general community: determinants and detection of left ventricular dysfunction", "url": null, "volume": "47"}, {"authors": [{"name": "L Co Ostello-Boe Err Rrigter"}, {"name": "G Boerrigter"}, {"name": "Redfield"}, {"name": "M Mm"}, {"name": "R Rodeheffer"}, {"name": ", Ur U Ban"}, {"name": "L Mahoney" }, {"name": "Dw W W"}, {"name": ","}, {"name": "Ja Jacobs Bs Bsen En En"}, {"name": "S S Sj J J Heublein"}, {"name": "D Burnett"}, {"name": "J"}], "date": false, "id": "b23 ", "index": 23, "issue": null, "journal": "mu mu munity ty y: : : de de dete te term rm min in inan", "publisher": null, "title": "Am Am Amino-terminal p p pro r r-B-ty ty type natriuretic peptide an an nd d B-type n na at atri ri riur ur", "url": null, "volume": null}], "date": false, "doi": "10.1161/circep.115.003091", "journal": {"eissn": "1941-3084", "issn": "1941-3149", "issue": null, "name": "Circulation: Arrhythmia and Electrophysiology", "publisher": "Ovid Technologies (Wolters Kluwer Health)", "volume" : null}, "title": "Fatigue as Presenting Symptom and a High Burden of Premature Ventricular Contractions Are Independently Associated with Increased Ventricular Wall Stress in Patients with Normal Left Ventricular Function"}
diff --git a/python/tests/files/first_monday_ojs3_fulltext.html b/python/tests/files/first_monday_ojs3_fulltext.html
new file mode 100644
index 0000000..2248aed
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_fulltext.html
@@ -0,0 +1,441 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Surveillance, stigma and sociotechnical design for HIV</title>
+</head>
+<body bgcolor="#ffffff" LINK="#bb7777" VLINK="#7777bb" ALINK="#ffee99" text="#000000">
+<blockquote><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71629" border="1" alt="First Monday" align="bottom"><br></blockquote>
+<hr>
+<blockquote>
+
+<center><a href="#author"><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71975" alt="Surveillance, stigma and sociotechnical design for HIV by Calvin Liang, Jevan Alexander Hutson, and Os Keyes" border="1"></a></center>
+
+<br><hr><br>
+
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71627" alt="Abstract"><br>Online dating and hookup platforms have fundamentally changed people&rsquo;s day-to-day practices of sex and love &mdash; but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms &ldquo;work&rdquo; for HIV frequently focus on user-to-user interactions and disclosure of one&rsquo;s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+
+<p><strong>Contents</strong></p>
+<p><a href="#p1">Introduction</a><br>
+<a href="#p2">Methods</a><br>
+<a href="#p3">Findings</a><br>
+<a href="#p4">Discussion</a><br>
+<a href="#p5">Conclusion</a></p>
+
+<p>&nbsp;</p><hr><p>&nbsp;</p>
+<p><strong><a name="p1"></a>Introduction</strong></p>
+
+<table width="70%" align="center"><tr><td>&ldquo;AIDS is essentially a crisis of governance, of what governments do and do not do, to and for their people &mdash; we have the drugs to treat HIV infection, we have the tools to confront the risks that drive HIV transmission and prevent infection itself &mdash; what we don&rsquo;t have is national political will necessary to scale-up our response. We have demanded too little from our leaders, excused far too much.&rdquo;<br>&mdash; Gregg Gonsalves, speech at the 2006 Toronto AIDS Conference.</td></tr></table>
+
+<table width="70%" align="center"><tr><td>&ldquo;Design is inherently about change &mdash; not just in the creation of new material artifacts, but in the ways that new technological objects afford new practices, social habits, and ways of living and interacting.&rdquo;<br>&mdash; Dombrowski, <em>et al.</em> (2016). &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments.&rdquo;</td></tr></table>
+
+<p>Living and loving with HIV is a complicated task. HIV status and the stigma attached to it exists within a complex interplay of social norms and medicolegal infrastructures. The medicolegal history of HIV begins the moment that HIV and AIDS emerged, constituting a mix of medically justified legal norms and legally enforced medical requirements. The criminal justice and public health systems of modern states demarcated people living with HIV as a uniquely dangerous population, &ldquo;one that needed to be sought out, tracked down, tested, reported, listed, tagged, monitored, regulated, and, increasingly, criminalized&rdquo; <a name="1a"></a>[<a href="#1">1</a>].</p>
+
+<p>The immediate policy response in the United States imposed significant criminal and civil liability upon people living with HIV (Hoppe, 2018; Harsono, <em>et al.</em>, 2017; Sykes, <em>et al.</em>, 2016; Thrasher, 2015; Galletly, <em>et al.</em>, 2014; Lehman, <em>et al.</em>, 2014; Gagnon, 2012; Pollard, 2006; Gostin, <em>et al.</em>, 1999). Between 1986&ndash;2019, HIV-specific criminal laws and sentence enhancements applicable to people living with HIV have been enacted in 34 states and two U.S. territories (Center for HIV Law &amp; Policy, 2019; Lehman, <em>et al.</em>, 2014). Since 1986, these laws have criminalized nondisclosure of HIV and engagement in &ldquo;risky&rdquo; behaviors such as sexual activity, exposure to bodily fluids, needle sharing, sex work, blood/organ/semen donation, and, in a variety of instances, behaviors posing little, if any, risk of HIV transmission (Center for Disease Control and Prevention, 2019a; Center for HIV Law &amp; Policy, 2019).</p>
+
+<p>Despite claiming medical legitimacy for this punitive approach, researchers have long understood that the criminalization of HIV transmission was instead fueled by the associations between HIV and the gay community and communities of color (Hoppe, 2018; Gallo, 2006; Johnson, 1992; Banks, 1989) at a time when consensual sex between same-sex partners was a criminal offense in twenty-two states and over 61 percent of American evangelicals and 50 percent of non-evangelicals agreed with the statement &ldquo;I sometimes think AIDS is a punishment for the decline in moral standards&rdquo; (Gallup and Castelli, 1987).</p>
+
+<p>A significant body of empirical social science work documents the harmful effects HIV laws have had on the lives of people living with HIV (Barr&eacute;-Sinoussi, <em>et al.</em>, 2018; Harsono, <em>et al.</em>, 2017; Sweeney, <em>et al.</em>, 2017; Adam, <em>et al.</em>, 2014). HIV criminalization both reinforces and magnifies HIV-related stigma and discrimination, reduces the willingness of persons at risk for HIV to get tested or seek care, and imperils the collection of demographic health information (Harsono, <em>et al.</em>, 2017; Burris and Cameron, 2008; Galletly and Pinkerton, 2006; Elliot, 2002). A survey of over 2,000 people living with HIV in the U.S. revealed that at least 25 percent of respondents knew one or more individuals who were afraid to get tested for fear of facing criminalization (Sero Project, 2012). HIV criminalization also ignores the reality that successful antiretroviral therapy can render the level of the virus undetectable, which, according to the National Institutes of Health, means that HIV is then untransmittable (Eisinger, <em>et al.</em>, 2019).</p>
+
+<p>While HIV transmission was criminalized, other tools of control &mdash; in the form of surveillance &mdash; arose and were enforced. Early policy responses to HIV centered on overt surveillance and ostracism of those infected and perceived to be at risk (Fortin, 1995). This surveillance generally consists of disease reporting, sexual contact tracing, and data collection of people who have been diagnosed with HIV (Fan, 2012; 2011; Ward and Bell, 2014; Ward, 2005). The Center for Disease Control, for example, collects HIV data based on confidential name-based reporting laws implemented in all 50 states as of April 2008 (Center for Disease Control and Prevention, 2019b).</p>
+
+<p>HIV surveillance (and sexually transmitted infection surveillance more broadly) centralizes information and power in the state (Fairchild, <em>et al.</em>, 2007; Fan, 2012); because HIV intervention and surveillance is generally concentrated in lower income communities and health settings (McCree and Hogben, 2010), the most socially and economically marginalized communities bear the heaviest burden of HIV surveillance and its downstream consequences (Miller, <em>et al.</em>, 2004; Banks, 1989; Brandt, 1987). There is a long-racialized history of HIV, one that, in combination with the background racism of the United States, has led to the systemic undertreatment and under-consideration of communities of color (Ford, <em>et al.</em>, 2007; Anonymous, 2000; Johnson, 1992).</p>
+
+<p>This infrastructure of surveillance in turn reinforces the stigma of HIV, which has dramatic consequences for the likelihood of unwanted disclosure, access to care, psychiatric well-being, housing and employment discrimination, and, consequently, quality (or probability) of life (Lazarus, <em>et al.</em>, 2016; Mahajan, <em>et al.</em>, 2008). Coupled with the overarching stigma of HIV and its criminalization in various contexts, HIV surveillance offers a tool through which the state can identify citizens to be punished.</p>
+
+<p>In the era of &ldquo;big data&rdquo; and ubiquitous surveillance capitalism (Zuboff, 2019) &mdash; the private monetization of information about reality &mdash; HIV surveillance is not just in the hands of the state, but also in the hands of private organizations and individuals. In the context of widespread state surveillance and control and ongoing stigmatization of HIV, this opens yet more possibilities for harm through enabling the selling and redistribution of HIV status information, without the user&rsquo;s meaningful consent, to parties who may themselves engage in discrimination or direct violence.</p>
+
+<p>Many online platforms &mdash; including, as we trace out below, dating platforms &mdash; constitute not just spaces for the purposes outlined in their marketing materials but also tools for the police in tracing HIV status and criminalized behavior. In recent years, police have used technology to conduct Internet-based investigations for a similar purpose (POZ, 2015). Police now go undercover on Web sites and dating apps by creating fake identities online (Semitsu, 2011), and local law enforcement agencies and federal agencies increasingly employ these tactics in online investigations (Lichtblau and Arkin, 2014).</p>
+
+<p>Legal and public health scholars and advocates continue to call for a paradigm shift in managing HIV that leaves behind historical responses like surveillance, ostracism, and incarceration and accounts for the rise of the Internet and mobile technology and their impact on sexual attitudes and behaviors (Lehman, <em>et al.</em>, 2014; McCallum, 2014; Fan, 2011; Fenton, 2010). Since the criminalization of HIV, intimate platforms have become vital structures through which millions of people access the opportunity to engage in reciprocal romantic and sexual relationships (Hutson, <em>et al.</em>, 2018; Taylor, <em>et al.</em>, 2017; Rosenfeld and Thomas, 2012). By designing infrastructures for intimate affiliation, intimate platforms wield unmatched structural power to shape who meets whom and how within dating and sexual platforms (Hutson, <em>et al.</em>, 2018; Levy and Barocas, 2018; Emens, 2008; Robinson, 2007). These platforms frame the circumstances within which users understand each other as prospective romantic or sexual partners and shape social norms, sexual scripts, and relative advantages among users (Hardy and Lindtner, 2017; Kannabiran, <em>et al.</em>, 2012).</p>
+
+<p>The design of intimate platforms provides opportunities to explore new ways of managing HIV that reduce the concentration of power and information in the state (Fan, 2012). Through the role that platform design plays in shaping cultural norms, which has been identified as a more effective way of achieving HIV transmission prevention than flexing the punitive and surveillant arms of the state (Sunstein, 1996), intimate platform design provides opportunities to explore new ways of managing HIV (Fan, 2012). Indeed, a meta-analysis of HIV prevention efforts found that strategies that intervene in social meaning by shaping social norms, cultural practices, and individual attitudes were more effective in empowering behavioral change than appeals to fear (Albarracin, <em>et al.</em>, 2015).</p>
+
+<p>However, designing intimate platforms to account for HIV also presents serious challenges for social computing researchers and human-computer interaction (HCI) designers. As Handel and Shklovski pointed out: &ldquo;The minutiae of design decisions around profile options deserves particular attention because even the smallest changes can result in substantial differences for user interactions&rdquo; (Handel and Shklovski, 2012). In addition to concerns around how to best design for HIV, platforms, Grindr in particular, have already come under fire for sharing user HIV information with third parties (Singer, 2018). Moreover, designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the serious risk of re-entrenching the status quo and its incumbent inequalities and power relations (Bardzell, 2010). While designing for HIV presents opportunities to redress stigma and harm, researchers in HCI must understand that &ldquo;[i]t is not enough to have good intentions ... [we] must ground [our] efforts in clear political commitments and rigorous evaluations of the likely consequences&rdquo; (Green, 2018).</p>
+
+<p>From this comes the recognition that social computing designers and researchers seeking to design for disclosure cannot afford to ignore the ways that the lived experiences of people living with HIV are shaped by structural forces and, particularly, the reality of HIV criminalization and the State&rsquo;s role in conducting STD surveillance. Platforms, after all, do not exist in a separate sphere from material reality: a redesign that eases HIV disclosure from user-to-user might also involve the storing of disclosure data by the platform &mdash; data that can then be accessed, requisitioned, and co-opted by arms of the state. In line with Jackson, <em>et al.&rsquo;s</em> call for the social computing community to address the structural and lived consequences of law and policy that &ldquo;establish the very terrain on which design and practice can be conceived, articulated, and imagined &mdash; and upon which battles of accountability are inevitably waged&rdquo; <a name="2a"></a>[<a href="#2">2</a>], we wish to undertake a critical investigation of HIV disclosure in dating and hookup platforms. This involves not just investigating the implications of disclosure in a person-to-person sense, but also how platform design is shaped by legal and administrative regulation and how the risks of disclosure might open users up to systems of surveillance, stigma, and criminalization. We do so by using a range of platforms in an effort to gain a wide view, and to practice prefigurative politics &mdash; minimizing our assumptions about the &ldquo;type&rdquo; of people at risk of HIV infection and/or surveillance.</p>
+
+<p>To do this, we analyze platforms&rsquo; consequences for HIV through the lens of user-to-user interactions, exploring the ways that design renders users visible and vulnerable to wider carceral and surveillance infrastructures, and the way that design shapes (and is shaped by) HIV&rsquo;s legal status. We ground our discussion in a content analysis of 50 popular, mobile dating and hookup platforms, coding for design and policy choices related to HIV disclosure, prevention, destigmatization, surveillance, privacy, and criminalization. Through this, we reveal that many platforms fail to account for HIV, and of those that do, many neglect to attend to the downstream consequences of HIV disclosure and the data produced by it, while exacerbating the social, racial, and class stereotypes associated with the condition.</p>
+
+<p>As scholars and designers consider how platform design might aid HIV prevention and destigmatization (Hutson, <em>et al.</em>, 2018; Albury, <em>et al.</em>, 2017; Wohlfeiler, <em>et al.</em>, 2013; Rosser, <em>et al.</em>, 2011), we aim to grapple with the structural and ethical implications of designing for HIV, particularly how intimate platform design might aid and abet the criminalization and surveillance of HIV (Sykes, <em>et al.</em>, 2016; Kazatchkine, <em>et al.</em>, 2015; Perone, 2013; Gagnon, 2012; J&uuml;rgens, <em>et al.</em>, 2009). Drawing on principles from social justice-oriented design to investigate controversies and design possibilities in intimate platforms, we attempt to articulate an approach to intimate platform design that not only works to reduce the stigma of user disclosure, but also works to contest historic and present power imbalances and injustices between users, platforms, and the state.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p2"></a>Methods</strong></p>
+
+<p>Using a directed content analysis (Hsieh and Shannon, 2005), we reviewed 50 existing mobile dating and hookup platforms. Content analyses have proven effective in understanding platform design and governance and the ways design practices mediate user-to-user bias and discrimination (Levy and Barocas, 2018; Hutson, <em>et al.</em>, 2018). We set out to capture a landscape of popular platforms and selected the first 50 dating and hookup platforms in the top 200 grossing social networking applications in the United States on the iOS App Store in March of 2019. <a href="#fig1">Figure 1</a> lists the platforms selected in alphabetical order.</p>
+
+<p>&nbsp;</p>
+<a name="fig1"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71623" alt="50 dating and hookup platforms surveyed"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 1:</strong> The 50 dating and hookup platforms surveyed.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>Utilizing the walkthrough method (Light, <em>et al.</em>, 2018), we explored each platform&rsquo;s HIV-related user experience. We examined design features on each of these platforms, systematically documenting design choices, policies, and informational interventions that mediate HIV. Building upon previous work around intimate platforms and HIV, we coded each of the 50 intimate platforms based on the following dimensions:</p>
+
+<table width="70%" align="center"><tr><td><p>Prevention</p>
+<ul><li>Whether the app allows same-sex connections</li>
+<li>Whether a user can disclose HIV/sexually transmitted infection (STI) status (Warner, <em>et al.</em>, 2018)</li>
+<li>If they can disclose, what are the options? (Warner, <em>et al.</em>, 2018)</li>
+<li>Whether a user can search for or filter out users with HIV/STIs (Hutson, <em>et al.</em>, 2018)</li>
+<li>Whether the platforms provide informational interventions with respect to HIV/STI prevention (Wang, <em>et al.</em>, 2019)</li></ul>
+<p>Stigma reduction</p>
+<ul><li>Whether a user can identify as having HIV/STI (<em>e.g.</em>, &ldquo;Poz&rdquo;, etc.)</li>
+<li>Whether a user can indicate interest in or acceptance of people living with HIV/STIs (<em>e.g.</em>, outward presentation, separate from filtering, not simply via profile text) (Hutson, <em>et al.</em>, 2018)</li></ul>
+<p>Policies</p>
+<ul><li>Whether the platform engages HIV/STIs in their policies (terms of service, privacy, and community policies, etc.) (Jackson, <em>et al.</em>, 2014)</li></ul></td></tr></table>
+
+<p>For ethical reasons, we did not interact with other users, only observed features, and deleted our accounts once data were collected when possible (not all platforms allowed for account deletion). The design and policy choices described and discussed below are not intended as an endorsement of any particular design intervention for managing HIV. Rather, we aim to capture the various ways intimate platforms currently manage and mediate HIV among users and how those choices map onto extant legal and surveillant infrastructures. Additionally, we highlight two limitations in how we chose which platforms to analyze. First, it is possible for a hookup platform to not have an accompanying mobile app, meaning our selection of platforms from the iOS App Store will inevitably have missed Web site-based platforms. Second, we may have overlooked platforms that are more niche or community-specific, yet not as popular in the broader platform marketplace (<em>i.e.</em>, not within the top grossing platforms).</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p3"></a>Findings</strong></p>
+
+<p>&nbsp;</p>
+<a name="fig2"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71624" alt="A visualization of our content analysis"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 2:</strong> A visualization of our content analysis.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Design features</strong></em></p>
+
+<p>Out of the 50 intimate platforms we examined, 13 were meant specifically for queer communities (11 specifically targeted at gay and bisexual men and two at lesbian and bisexual women). None of the platforms we reviewed were distinctly designed for trans people. The remaining 34 platforms were for general audiences, catering to heterosexual and homosexual connections, and three platforms were exclusively for heterosexual connections (eHarmony, Uniform Dating, and Waplog). Only queer-specific platforms (six) had explicit HIV disclosure options and allowed for filtering or searching based on HIV status. <a href="#fig3">Figure 3</a> shows the disclosure options for each platform. Growlr, Taimi, and Scruff allowed users to indicate that they were accepting of people living with HIV. Grindr, Hornet, Mr. X, Xtremboy, and Scruff, five platforms all of which are queer-specific, provide informational interventions with respect to HIV/STI prevention (See <a href="#fig4">Figure 4</a> for examples). Eight dating apps mentioned HIV in their policies (five queer-specific, three general). Four dating apps allowed users to identify with an HIV/STI-relevant identity category, often labeled &ldquo;poz&rdquo;. Please see <a href="#fig2">Figure 2</a> for a visualization of our content analysis.</p>
+
+<p>&nbsp;</p>
+<a name="fig3"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71625" alt="Disclosure options"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 3:</strong> Disclosure options.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>&nbsp;</p>
+<a name="fig4"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71626" alt="Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right)"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 4:</strong> Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right).</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Policies</strong></em></p>
+
+<p>None of the 50 intimate platforms we reviewed explicitly mention HIV in their terms of service. Four platforms expressly discuss HIV in their privacy policies (Grindr, Hornet, Scruff, and Mr. X), and four platforms mention HIV in platform safety policies (Planet Romeo, Tinder, BlackPeopleMeet, and Our Time). No platform engaged any of the legal implications of HIV. No platform engaged the public health surveillance of HIV.</p>
+
+<p>Of the four platforms that expressly engage HIV in their privacy policies (Grindr, Hornet, Mr. X, Scruff), only two (Grindr &amp; Hornet) explicitly prohibit sharing HIV information with third parties. By disclosing one&rsquo;s HIV status on Mr. X and Scruff, users consent to the platform&rsquo;s processing of that information. Grindr warns that HIV status disclosure on a user profile is effectively public information, however the platform does not share HIV status information with third party tracking, analytics, and advertising companies or service providers. Of all the platforms reviewed, Grindr&rsquo;s privacy policy is the only one that devotes an entire section to HIV status, which is not particularly surprising given Grindr&rsquo;s involvement in multiple controversies around sharing HIV information with third parties (Fitzsimons, 2019; Singer, 2018):</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;HIV Status. At the recommendation of HIV prevention experts and the community of Grindr users, we give you the option of publishing your health characteristics, such as your HIV status, in your Grindr community profile. Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App. As a result, you should carefully consider whether you want to disclose your HIV status. We do not share HIV status with any third-party service advertisers or third-party service providers other than companies that host data on our behalf (<em>e.g.</em>, Amazon Cloud). In addition, we do not use HIV status for advertising purposes and do not share this information with advertisers.&rdquo;</td></tr></table>
+
+<p> According to Hornet&rsquo;s privacy policies, they &ldquo;[do] not share any HIV status information with third parties unless required to do so by law&rdquo;. Of the 50 platforms reviewed, Hornet was the only one to enable users to opt into receiving &ldquo;in-app reminders to undergo HIV tests and receive information on the location of nearby testing centers.&rdquo; On Hornet, a user&rsquo;s HIV status &ldquo;is only searchable by users who have defined themselves as HIV positive.&rdquo; Scruff&rsquo;s privacy policy highlights that &ldquo;there is no requirement to&rdquo; provide them with &ldquo;health details and whether part of the POZ (HIV positive) community (for example, in creating or updating your profile),&rdquo; and that by doing so, users &ldquo;are explicitly consenting to [Scruff&rsquo;s] processing of [their] information.&rdquo; Mr. X&rsquo;s privacy policy notes that HIV status information &ldquo;may be considered &lsquo;special&rsquo; or &lsquo;sensitive&rsquo; in certain jurisdictions,&rdquo; and that by providing this information, users &ldquo;consent to [Mr. X&rsquo;s] processing that information&rdquo;.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p4"></a>Discussion</strong></p>
+
+<p><em><strong>Prevention</strong></em></p>
+
+<p>Platforms can act as an interventional tool to improve access to and perceptions of care for people living with HIV. Examples of HIV/STI prevention include a &ldquo;Last Tested Date&rdquo; section on a user&rsquo;s profile and reminders to get tested for HIV/STIs. Some current platforms engage with HIV more critically by acknowledging that HIV is an issue their users should be aware of through specific features. Hornet, for instance, provides its users with HIV-relevant educational material and resources for getting tested. Hornet also limits searching based on HIV status to people who themselves have chosen the HIV positive option, thereby limiting the possibility of HIV status-based discrimination. Hornet and Grindr can also provide reminders for users to get tested. Scruff allows users to choose from sex safety practices that include using condoms, using pre-exposure prophylaxis (PrEP), and/or treatment as prevention (Warner, <em>et al.</em>, 2019).</p>
+
+<p>Due in large part to the history of HIV&rsquo;s recognition as a medical condition, HIV has been generally classified as a &ldquo;gay man&rsquo;s problem&rdquo; in North America &mdash; frequently (albeit almost as frequently unmarked) a white, cisgender gay man&rsquo;s problem. This classification and framing acted to both separate normative society from the stigma associated with the condition and provide an avenue for activism by associating it with the most &ldquo;acceptable&rdquo; queer bodies: masculine, middle class, cisgender and white (Epstein, 1996).</p>
+
+<p>HIV has disproportionately impacted gay communities specifically, but transmission does not fit a neat pattern of being binarized tidily along sexuality. It is disproportionately prevalent in communities of color, appears in heterosexual relationships and lives, and risk of transmission follows societal vulnerability and marginalization &mdash; transgender women, particularly transgender women of color, are particularly overrepresented in diagnosis rates (Clark, <em>et al.</em>, 2017). While the partial normalization of HIV &mdash; holding it outside the direct concerns of white, cisgender, heterosexual people, but embodying it in people who look &ldquo;just like them&rdquo; &mdash; may have aided in assembling efforts to address the condition, the assumptions that it has created in who is at risk and who &ldquo;counts&rdquo; have been tremendous. One only has to look at the ethnographic work of Vivianne Namaste, who highlights how Montreal&rsquo;s history of HIV, its recognition, and efforts at its prevention simultaneously elided the incidence rate amongst the Haitian community (which at one point had 65 percent of reported AIDS cases) and lacked any advice or conception of susceptibility for women, particularly heterosexual or bisexual women (Namaste, 2015).</p>
+
+<p>Our platform analysis demonstrates that these same assumptions about vulnerability and risk are present in the design of intimate platforms. Generic platforms (<em>i.e.</em>, those that cater to non-queer or broader, more heteronormative audiences) entirely do not consider, engage, or design for HIV while the platforms for queer &mdash; and more specifically gay men &mdash; do. Even within the group of 13 queer-specific applications, neither of the two queer women-specific apps allowed for HIV disclosure, even though 23 percent of people with HIV in the U.S. are women (Center for Disease Control and Prevention, 2019c). Most, if not all, platforms dedicated to general audiences do nothing when it comes to HIV prevention, contributing to the knowledge gap for general audiences on sexual health, HIV-specific, and more. Because general audiences can go through online dating experiences without encountering HIV materials, platform designers allow these users to falsely believe that their sexual lives are excluded from important matters of sexual health.</p>
+
+<p>Our intent is not to suggest that HIV should be narrated as a problem for everyone; to ignore sexuality in the impact and risk of HIV transmission is an ahistorical mistake. But treating it <em>solely</em> as a &ldquo;gay man&rsquo;s problem&rdquo; simultaneously elides differences in vulnerability and risk within gay communities and perpetuates the silence around transmission for other populations, particularly trans women of color and/or heterosexual people. In other words, it is not that HIV is not frequently a risk for gay communities, but that drawing a line between sexuality and risk perpetuates the more nuanced disparities in risk and the discourse that HIV transmission is not something anyone else has to think about.</p>
+
+<p>Platforms can and have implemented prevention efforts through Last Tested Date and Testing Reminders features. Doing so more ubiquitously, rather than solely on gay male-specific platforms, may be helpful in normalizing prevention efforts like getting tested regularly and knowing one&rsquo;s status. Through opportunities like this, platform designers have the opportunity to promote HIV/STI prevention and care &mdash; an opportunity that is valuable precisely for its ability to normalize prevention efforts. This is not to say that such features are without risks, particularly with regard to state surveillance, intervention and structural forces, which is our next topic of concern and discussion.</p>
+
+<p><em><strong>Stigma &amp; disclosure</strong></em></p>
+
+<p>Designing for HIV is not as simple as including disclosure fields and status-based filtering or not. Allowing disclosure and filtering can protect people living with HIV from negative and sometimes harmful interactions, help filter out people who might discriminate against them, fight HIV stigma, and promote much-needed awareness. However, disclosure and filtering can also lead to discriminatory practices (Hutson, <em>et al.</em>, 2018), have potential for privacy unraveling (Warner, <em>et al.</em>, 2018), and contribute to surveillance (Fan, 2012, 2011).</p>
+
+<p>De-stigmatizing HIV offers designers an opportunity to engage in the structural dimensions of how HIV operates in social life and can possibly allow us to better tap into social norms around the condition that ultimately improve other outcomes. For instance, humanizing people living with HIV could lead to more people getting tested, being open about their status, and being communicative with their sexual partners. Platforms have the power to shift social norms and destigmatize HIV at scale due to their pervasiveness throughout modern connections, but designers need to contest the ethical implications of de-stigmatizing HIV on these platforms, especially through current features such as HIV-status-based filtering and disclosure options.</p>
+
+<p>Filtering and searching tools based on HIV status can be instrumental for people living with HIV to find others who are either seropositive or otherwise accepting of seropositive people. Additionally, filtering out those who might discriminate against them for their HIV status anyways allows people living with HIV to avoid awkward or even violent interactions with users who harbor problematic beliefs about people living with HIV. Conversely, HIV status-based filtering and searching tools have representational and allocational harms. First, it represents that there are particular psycho-social characteristics incumbent with HIV status. These stereotypes play out in a variety of different ways such as the framing that people living with HIV engage in &ldquo;risky&rdquo; sexual behavior. Second, HIV status-based filtering can be used to structurally exclude HIV positive users from the opportunity to engage in intimate affiliation (Hutson, <em>et al.</em>, 2018). Platforms can and do provide users the ability to screen out other users who identify as &ldquo;Poz&rdquo; or disclose their HIV status. Not only do these design features facilitate exclusion, they may disincentivize HIV related disclosures to the extent that such disclosures can be weaponized by other users to exclude them as potential intimate affiliates.</p>
+
+<p>Disclosure fields as a way to de-stigmatize HIV are similarly complicated in that they can inhibit and benefit people living with HIV. For one, encouraging users to disclose, regardless of their status, can create a healthier culture and discussion around HIV, possibly making talking about one&rsquo;s status an acceptable and common practice of intimate engagement. On the other hand, disclosure can be used for a variety of problematic ends that harm seropositive users. Other users may discriminate against users who have disclosed their HIV status, choosing to ignore or disengage with them entirely. Disclosure may have unintended consequences and lead to more personal and violent outcomes. Due to laws in particular jurisdictions, failure to disclose one&rsquo;s status to a partner can lead to prosecution and potentially incarceration. People living with HIV might also face physical and emotional threats for disclosing their status either publicly or privately.</p>
+
+<p>Due to these complexities, designers of dating platforms must face the question of how can we de-stigmatize HIV without creating additional obstacles for people living with HIV? Platforms need to critically unpack the possible consequences of well-intentioned design choices, including HIV status-based filtering and HIV status disclosure fields. Of the platforms we reviewed, Scruff is the only one to provide for HIV disclosure without using an express &ldquo;HIV status&rdquo; field, allowing instead two disclosure options, Poz and Treatment as Prevention. &ldquo;Poz&rdquo; constitutes an association and identification with a community (<em>e.g.</em>, &ldquo;I am a bear, daddy, poz&rdquo;), while &ldquo;Treatment as Prevention,&rdquo; signals antiretroviral therapy (<em>i.e.</em>, use of HIV medicines to treat HIV infection) and constitutes a link to sex safety practices.</p>
+
+<p><em><strong>Surveillance &amp; criminalization</strong></em></p>
+
+<p>At the same time, given the questions of structural power and surveillance built into these platforms, we are leery of treating disclosure option design as the site of de-stigmatization and justice. Questions of privacy and stigma go wider than micro-interactions and touch on how HIV is seen and responded to societally and administratively. The dominant responses to HIV/AIDS &ldquo;center on adjusting the traditional levers of criminal and tort law, and of public health law, with its surveillance and disciplinary regimes that concentrate information and decision-making in the state&rdquo; <a name="3a"></a>[<a href="#3">3</a>]. Indeed, HIV continues to function as a &ldquo;vector for the exercise of state power and the invention of novel logics and techniques of government,&rdquo; whereby &ldquo;[i]nfection with HIV virtually guarantees that a citizen will need to interact, either beneficently or coercively, with one or more state bureaucracies&rdquo; <a name="4a"></a>[<a href="#4">4</a>].</p>
+
+<p>The broader ecosystem of intimate platforms that we observed provided virtually no HIV-specific privacy information or protections for users living with HIV. Overall, both the platforms that account for HIV in their privacy policies and the platforms that enable disclosure but do not account for HIV in their privacy policies continue to place the risks and burden of surveillance, privacy, and disclosure on users with HIV. Grindr&rsquo;s &ldquo;HIV Status&rdquo; policy puts it clearly: &ldquo;Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App.&rdquo; By surfacing this as a risk we do not mean to suggest that users lack agency &mdash; merely that the agency to choose between a range of options can be influenced by how those options are bounded and made available in addition to the affordances and norms that platform design provides. That a user makes information public does not mean that &ldquo;consumable by all&rdquo; is the framework of disclosure that they have in mind (Wittkower, 2016).</p>
+
+<p>While some intimate platforms are working towards promoting HIV disclosure, prevention, and de-stigmatization, they are also failing to grapple with privacy implications of HIV and their responsibility in ensuring it. People living with HIV are already vulnerable and bear the weight of HIV disclosure&rsquo;s downstream consequences. By continuing to offload the burdens and risk on those with HIV, platforms are likely contributing to issues of nondisclosure as well as HIV testing. Research shows that privacy fears can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002).</p>
+
+<p>In this context, proposals to design for HIV disclosure that do not consider the wider structural implications of surveillance are concerning. The focus of most research into HIV and online dating in HCI on micro-interactions and enabling trust and certainty between users elides the implications that providing this data to a platform outside user control has and the way that this data can be used to control. This is not an abstract risk; just this year, Grindr (the platform under study) has been the subject of scrutiny by the U.S. government over its Chinese ownership, due to fears that the Chinese government might access and copy Grindr&rsquo;s data around HIV disclosure for the purpose of domestic policing and control (Fitzsimons, 2019). If we are designing to enable HIV disclosure, are we working to improve stigma associated with disclosure &mdash; or are we enabling new forms of control and surveillance?</p>
+
+<p>In the United States today, intimate platforms operate within 29 states that have HIV criminal laws, which include laws that target sex/nondisclosure of HIV-positive status, sex work, exposure to bodily fluids, needle-sharing, and blood/organ/semen donation, nine states that have sentencing enhancements applicable to people living with HIV who commit an underlying assault crime, and 24 states that have prosecuted people living with HIV under non-HIV-specific general criminal laws (Center for HIV Law &amp; Policy, 2019). Here, the design of intimate platforms cannot be removed from the reality of laws that criminalize HIV, particularly HIV non-disclosure.</p>
+
+<p>People living with HIV in U.S. states with HIV-specific criminal laws must disclose their HIV status to sexual partners. Generally, &ldquo;disclosure and consent&rdquo; is an affirmative defense <a name="5a"></a>[<a href="#5">5</a>], whereby a person can avoid criminal and civil liability if they disclose their serostatus <a name="6a"></a>[<a href="#6">6</a>] and their sexual partner voluntarily consents to sexual activity with knowledge of that serostatus <a name="7a"></a>[<a href="#7">7</a>]. Many of the laws that criminalize HIV non-disclosure do not provide guidance as to what methods of disclosure and consent are enough to avoid prosecution and conviction (McCallum, 2014). No court or legislature has affirmatively stated whether verbal disclosure and consent are necessary under criminal HIV transmission statutes. Furthermore, non-verbal communication online creates uncertainty as to whether there is sufficient disclosure and consent to remove criminal liability for HIV-positive individuals. Both disclosure and consent can be ambiguous or misunderstood, a problem that is complicated by the design and widespread use of mobile dating and hookup platforms.</p>
+
+<p>It remains unclear what constitutes appropriate disclosure and informed consent in the context of intimate platforms, such as HIV disclosure fields on user profiles or other communication in a profile&rsquo;s free form text sections (<em>e.g.</em>, &ldquo;+&rdquo; &ldquo;Poz&rdquo;, &ldquo;undetectable&rdquo;). Although some intimate platforms afford HIV-positive users the ability to disclose their serostatus in new ways, no court or legislature in the U.S. has answered whether disclosing HIV status on an intimate platform is enough to achieve informed consent and avoid criminal and civil liability. Yet many people living with HIV also use records of conversations on intimate platforms as a means of protection. For example, people disclose their status and use that record as a way to protect themselves from future allegations of non-disclosure. This ambiguity and incumbent legal risk places significant responsibility and pressure on HIV users. Research shows that fears around rejection, self-blame, criminalization, and privacy can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002). Privacy concerns around HIV disclosure are often associated with the need to protect one&rsquo;s self from HIV related stigma (Adam, <em>et al.</em>, 2011; Serovich and Mosack, 2006; Greene, <em>et al.</em>, 2003). As more and more people use platforms to meet intimate partners, the historical failure of HIV criminalization law to understand how disclosure and consent are negotiated in practice becomes all the more apparent.</p>
+
+<p>It might seem from this that designers and developers are trapped in an impossible situation &mdash; disclosure to protect users simultaneously produces the possibility of structural harms for those disclosing. While we urge designers to take both needs seriously, we do not consider it impossible; in fact, there is a range of work within queer theory and technology that not only articulates this tension of privacy, disclosure and the reuse of data but suggests queer forms of resistance to it. Writing more broadly, Brian Schram highlights the way that the increasing possibilities of &ldquo;big data&rdquo; and its attendant surveillance structures &ldquo;constitute an undoing of Queerness as a radical political injection&rdquo; <a name="8a"></a>[<a href="#8">8</a>], advocating a politics of <em>melancholia</em> that features a haunting of archives: an insertion of the dead weight of our collective memory as Queer persons into the growing catalog of our digital information. In other words, Schram suggests the deliberate incorporation of masses of false data, profiles, and traces into data stores in order to render ambiguous the truth of any presence and provide cover for those queer persons existing within the platform(s) data highlights. What would this look like in the case of dating platforms? What are the possibilities raised by incorporating a deluge of false accounts, <em>doppelg&auml;ngers</em>, and doubles, not as a deception of the platform or its users, but against state forces examining the database?</p>
+
+<p>More broadly, we might see possibilities for the future through practices in the past. In how queer communities responded to HIV disclosure and protection protocols during the 1980s and 1990s, David Halperin has articulated the way that gay communities worked to articulate norms that balanced risks, trust, and vulnerability in the absence of structural norms, that &ldquo;it is gay men themselves who have continued to define, and to redefine, the limits of safety through an ongoing history of sexual experimentation and mutual consultation, and who have thereby produced, over time, workable compromises and pragmatic solutions that balance safety and risk&rdquo; <a name="9a"></a>[<a href="#9">9</a>]. Rather than taking universalized, top-down approaches to platform design for all, we might instead seek to work up and to create a diverse range of spaces that challenge the ease of surveillance built into large-scale platforms and afford individual users more agency in establishing those compromises and solutions and engaging in that consultation.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p5"></a>Conclusion</strong></p>
+
+<p>As HCI researchers and designers, we continue to push the boundaries of what is technologically possible but doing so requires us to first ask whether platform design is even an appropriate intervention in a given situation (Keyes, <em>et al.</em>, 2019; Baumer and Silberman, 2011; Suchman, 2011). The current model of platform design for HIV cannot continue, as it is too closely tied to the collection and commodification of highly sensitive personal data. However, reimagining intimate platform design provides the social computing community an opportunity to intervene in the social norms around HIV and HIV disclosure in a manner that could unburden the weight of criminalization without centralizing the surveillant arms of the state.</p>
+
+<p>We envision a future of dating platforms that does not force people living with HIV to sacrifice surveillance for intimate experiences. Because of their entanglements with sex and romance, intimate platforms need to take on more responsibility in the sexual health and data privacy of their users. Drawing from our analysis and our own lived experiences, we recommend platform-level changes, changes in platform, and mechanisms to prevent platforms from knowing their users&rsquo; statuses. First, platforms should make explicit to their users the consequences of storing sensitive, personal information like HIV status and their documentation processes. Next, they should also implement policies that manage how data are stored when users delete their accounts and protect these data from third-party consumers. Finally, ownership of users&rsquo; data should belong to the users themselves, rather than the platforms. Users should be able to pass along their information to other users without the platforms tracking it.</p>
+
+<p>HIV is a medical condition, but its eradication requires not just technical, or even sociotechnical, but socio<em>political</em> solutions. Indeed, the way in which designers and policy-makers frame HIV is an inherently political decision, one that will impose the contours and boundaries of our response. The social computing community cannot do nothing, but it also must resist the desire to do everything. Designing user interfaces and platform policies to account for HIV will require a rigorous analysis of possible outcomes and consequences as well as a bedrock commitment to centering the voices and experiences of those impacted by HIV and the state&rsquo;s responses to it. Our commitments must account for the ways pathology and power intertwine to subjugate and otherize impacted communities at home and abroad.</p>
+
+<p>Designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the risk of re-entrenching the status quo and its incumbent inequalities and power relations (Dombrowski, <em>et al.</em>, 2016; Light, 2011; Irani, <em>et al.</em>, 2010; Bardzell, 2010). The social computing community must ground its efforts to design for HIV in clear political commitments to decriminalizing HIV and decentralizing power and information from the state. We must strive to unburden the weight of surveillance and incarceration on vulnerable and marginalized communities and work towards offloading the significant social and legal risks and pressures for people living with HIV. Moreover, our commitment to designing for HIV must not exclude nor obfuscate our capacity for direct action within and outside of the realms of design and research. This means fighting for the rights, dignity, and safety of people living with HIV in the streets and in the halls of local, national, and international political, legislative, and executive bodies.</p>
+
+<p>Our instinctual response to the failed and violent efforts of HIV criminalization and surveillance should not be &ldquo;there&rsquo;s an app for that,&rdquo; but rather &ldquo;there&rsquo;s a zap for that!&rdquo;. That is, the practice of designing for people with HIV should be a &ldquo;critical technical practice&rdquo; (Agre, 1997), undertaken with a mindset that sits uneasily between and is cognizant of both individual and structural power and consequence. Pioneered by the American gay liberation movement, a zap or &ldquo;zap action&rdquo; is a political action of direct and persistent public confrontation. Whether shouting down public figures or smashing pies into the faces of evangelicals, zaps aim to disrupt and disturb persons and institutions of authority to effect change (Cohen, 2018). In the words of AIDS Coalition to Unleash Power&rsquo;s (ACT UP) &ldquo;New Member Packet&rdquo;:</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;Zaps are a method for ACT UP members to register their disapproval of and anger toward the zap target. Zaps usually have more specific targets than actions. Because of this focus, numerous zapping techniques have been developed. ACT UP zaps individuals or organizations by: sending postcards or letters; invading offices and distributing fact sheets; sending (lots and lots of) faxes; picketing; outraged (and sometimes outrageous) phone calls. The more zappers who zap the zappee the better the zap.&rdquo;</td></tr></table>
+
+<p>A critical approach to designing for HIV requires the contesting of histories of incarceration, stigmatization, and surveillance and the ways in which the state exerts power and domination through its medicolegal levers of criminal law and public health surveillance. Intimate platform design should not only work to reduce the prevalence and stigma of HIV, but also to contest historic and present power imbalances and injustices between users, platforms, and the state. <img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71628" alt="End of article"></p>
+
+<p>&nbsp;</p>
+<a name="author"></a>
+<p><strong>About the authors</strong></p>
+
+<p><strong>Calvin Liang</strong> is a Ph.D. student in the Human-Centered Design and Engineering Department at the University of Washington. His research broadly focuses on technology&rsquo;s role in and out of queerness, health, and queer health.<br>E-mail: cliang02 [at] uw [dot] edu</p>
+
+<p><strong>Jevan Alexander Hutson</strong>, living with HIV for four years, is a technology policy advocate, human-computer interaction researcher, and J.D. candidate at the University of Washington School of Law. His research interests center on issues of technology, law, and social life, with a particular focus on intimate/sexual computing.<br>E-mail: jevanh [at] uw [dot] edu</p>
+
+<p><strong>Os Keyes</strong> is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.<br>E-mail: okeyes [at] uw [dot] edu</p>
+
+<p>&nbsp;</p>
+<p><strong>Acknowledgements</strong></p>
+
+<p>We dedicate this paper to the radical history of the AIDS Coalition to Unleash Power (ACT UP) and to all of the souls we&rsquo;ve lost and continue to lose to HIV/AIDS. We would like to thank Mary Fan, Sean Munson, and Julie Kientz for valuable conversations and feedback, and Margret Wander and Margaret Hopkins for their ongoing care and support. This research was partially funded by a Microsoft Ada Lovelace Fellowship.</p>
+
+<p>&nbsp;</p>
+<p><strong>Notes</strong></p>
+
+<p><a name="1"></a><a href="#1a">1.</a> Halperin and Hoppe, 2017, p. 349.</p>
+
+<p><a name="2"></a><a href="#2a">2.</a> Jackson, <em>et al.</em>, 2014, p. 596.</p>
+
+<p><a name="3"></a><a href="#3a">3.</a> Fan, 2011, p. 36.</p>
+
+<p><a name="4"></a><a href="#4a">4.</a> Halperin and Hoppe, 2017, p. 255.</p>
+
+<p><a name="5"></a><a href="#5a">5.</a> See FLA. STAT. ANN. &sect; 775.0877 (2017) (&ldquo;[I]t is an affirmative defense to a charge of violating this section that the person exposed knew that the offender was infected with HIV, knew that the action being taken could result in transmission of the HIV infection, and consented to the action voluntarily with that knowledge.&rdquo;). See also <a href="http://www.hivlawandpolicy.org/states/florida">http://www.hivlawandpolicy.org/states/florida</a>.</p>
+
+<p><a name="6"></a><a href="#6a">6.</a> Serostatus is defined as: &ldquo;The state of either having or not having detectable antibodies against a specific antigen, as measured by a blood test (serologic test). For example, HIV seropositive means that a person has detectable antibodies to HIV; seronegative means that a person does not have detectable HIV antibodies.&rdquo; U.S. Department of Health &amp; Human Services, Education Materials, AIDSINFO, at <a href="https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus" target="_blank">https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus</a>, accessed 30 August 2019.</p>
+
+<p><a name="7"></a><a href="#7a">7.</a> Lehman, <em>et al.</em>, 2014, p. 1,101.</p>
+
+<p><a name="8"></a><a href="#8a">8.</a> Schram, 2019, p. 611.</p>
+
+<p><a name="9"></a><a href="#9a">9.</a> Halperin, 2015, p. 207.</p>
+
+<p>&nbsp;</p>
+<p><strong>References</strong></p>
+
+<p>Barry D. Adam, Richard Elliott, Patrice Corriveau, and Ken English, 2014. &ldquo;Impacts of criminalization on the everyday lives of people living with HIV in Canada,&rdquo; <em>Sexuality Research and Social Policy</em>, volume 11, number 1, pp. 39&ndash;49.<br>doi: <a href="https://doi.org/10.1007/s13178-013-0131-8" target="_blank">https://doi.org/10.1007/s13178-013-0131-8</a>, accessed 5 September 2020.</p>
+
+<p>Barry D. Adam, James Murray, Suzanne Ross, Jason Oliver, Stephen G. Lincoln, and Vicki Rynard, 2011. &ldquo;Hivstigma.com, an innovative Web-supported stigma reduction intervention for gay and bisexual men,&rdquo; <em>Health Education Research</em>, volume 26, number 5, pp. 795&ndash;807.<br>doi: <a href="https://doi.org/10.1093/her/cyq078" target="_blank">https://doi.org/10.1093/her/cyq078</a>, accessed 5 September 2020.</p>
+
+<p>Philip E. Agre, 1997. &ldquo;Toward a critical technical practice: Lessons learned in trying to reform AI,&rdquo; In: Geof Bowker, Les Gasser, Leigh Star, and Bill Turner (editors). <em>Bridging the great divide: Social science, technical systems, and cooperative work</em>. Mahwah, N.J.: Erlbaum.</p>
+
+<p>Anonymous, 2000. &ldquo;Name brands: The effects of intrusive HIV legislation on high-risk demographic groups,&rdquo; <em>Harvard Law Review</em>, volume 113, number 8, pp. 2,098&ndash;2,113.<br>doi: <a href="https://doi.org/10.2307/1342321" target="_blank">https://doi.org/10.2307/1342321</a>, accessed 5 September 2020.</p>
+
+<p>Taunya Lovell Banks, 1989. &ldquo;Women and AIDS &mdash; Racism, sexism, and classism,&rdquo; <em>New York University Review of Law &amp; Social Change</em>, volume 17, pp. 351&ndash;385, and at <a href="https://digitalcommons.law.umaryland.edu/fac_pubs/328" target="_blank">https://digitalcommons.law.umaryland.edu/fac_pubs/328</a>, accessed 5 September 2020.</p>
+
+<p>Shaowen Bardzell, 2010. &ldquo;Feminist HCI: Taking stock and outlining an agenda for design,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,301&ndash;1,310.<br>doi: <a href="https://doi.org/10.1145/1753326.1753521" target="_blank">https://doi.org/10.1145/1753326.1753521</a>, accessed 5 September 2020.</p>
+
+<p>Fran&ccedil;oise Barr&eacute;-Sinoussi, Salim S. Abdool Karim, Jan Albert, Linda-Gail Bekker, Chris Beyrer, Pedro Cahn, Alexandra Calmy, Beatriz Grinsztejn, Andrew Grulich, Adeeba Kamarulzaman, Nagalingeswaran Kumarasamy, Mona R. Loutfy, Kamal M. El Filali, Souleymane Mboup, Julio S.G. Montaner, Paula Munderi, Vadim Pokrovsky, Anne-Mieke Vandamme, Benjamin Young, and Peter Godfrey-Faussett, 2018. &ldquo;Expert consensus statement on the science of HIV in the context of criminal law,&rdquo; <em>Journal of the International AIDS Society</em>, volume 21, number 7.<br>doi: <a href="https://doi.org/10.1002/jia2.25161" target="_blank">https://doi.org/10.1002/jia2.25161</a>, accessed 5 September 2020.</p>
+
+<p>Eric P.S. Baumer and M. Six Silberman, 2011. &ldquo;When the implication is not to design (technology),&rdquo; <em>CHI &rsquo;11: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 2,271&ndash;2,274.<br>doi: <a href="https://doi.org/10.1145/1978942.1979275" target="_blank">https://doi.org/10.1145/1978942.1979275</a>, accessed 5 September 2020.</p>
+
+<p>Allan M Brandt, 1987. <em>No magic bullet: A social history of venereal disease in the United States since 1880</em>. Expanded edition. Oxford: Oxford University Press.</p>
+
+<p>Scott Burris and Edwin Cameron, 2008. &ldquo;The case against criminalization of HIV transmission,&rdquo; <em>Journal of the American Medical Association</em>, volume 300, number 5, pp. 578&ndash;581.<br>doi: <a href="https://doi.org/10.1001/jama.300.5.578" target="_blank">https://doi.org/10.1001/jama.300.5.578</a>, accessed 5 September 2020.</p>
+
+<p>Center for Disease Control and Prevention, 2019a. &ldquo;HIV and STD criminal laws,&rdquo; at <a href="https://www.cdc.gov/hiv/policies/law/states/exposure.html" target="_blank">https://www.cdc.gov/hiv/policies/law/states/exposure.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019b. &ldquo;HIV surveillance reports,&rdquo; at <a href="https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html" target="_blank">https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019c. &ldquo;HIV and women,&rdquo; at <a href="https://www.cdc.gov/hiv/group/gender/women/" target="_blank">https://www.cdc.gov/hiv/group/gender/women/</a>, accessed 5 September 2020.</p>
+
+<p>Center for HIV Law &amp; Policy, 2019. &ldquo;HIV criminalization in The United States,&rdquo; at <a href="http://www.hivlawandpolicy.org/sourcebook" target="_blank">http://www.hivlawandpolicy.org/sourcebook</a>, accessed 2 February 2020.</p>
+
+<p>Hollie Clark, Aruna Surendera Babu, Ellen Weiss Wiewel, Jenevieve Opoku, and Nicole Crepaz, 2017. &ldquo;Diagnosed HIV infection in transgender adults and adolescents: Results from the National HIV Surveillance System, 2009&ndash;2014,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 9, pp. 2,774&ndash;2,783.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1656-7" target="_blank">https://doi.org/10.1007/s10461-016-1656-7</a>, accessed 5 September 2020.</p>
+
+<p>Sascha Cohen, 2018. &ldquo;How gay activists challenged the politics of civility,&rdquo; <em>Smithsonian Magazine</em> (10 July), at <a href="https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/" target="_blank">https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2004. &ldquo;Reasons for HIV disclosure/nondisclosure in close relationships: Testing a model of HIV-disclosure decision making,&rdquo; <em>Journal of Social and Clinical Psychology</em>, volume 23, number 6, pp. 747&ndash;767.<br>doi: <a href="https://doi.org/10.1521/jscp.23.6.747.54804" target="_blank">https://doi.org/10.1521/jscp.23.6.747.54804</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2002. &ldquo;Perceived HIV-related stigma and HIV disclosure to relationship partners after finding out about the seropositive diagnosis,&rdquo; <em>Journal of Health Psychology</em>, volume 7, number 4, pp. 415&ndash;432.<br>doi: <a href="https://doi.org/10.1177/1359105302007004330" target="_blank">https://doi.org/10.1177/1359105302007004330</a>, accessed 5 September 2020.</p>
+
+<p>Lynn Dombrowski, Ellie Harmon, and Sarah Fox, 2016. &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments,&rdquo; <em>DIS &rsquo;16: Proceedings of the 2016 ACM Conference on Designing Interactive Systems</em>, pp. 656&ndash;671.<br>doi: <a href="https://doi.org/10.1145/2901790.2901861" target="_blank">https://doi.org/10.1145/2901790.2901861</a>, accessed 5 September 2020.</p>
+
+<p>Robert W. Eisinger, Carl W. Dieffenbach, and Anthony S. Fauci, 2019. &ldquo;HIV viral load and transmissibility of HIV infection: Undetectable equals untransmittable,&rdquo; <em>Journal of the American Medical Association</em>, volume 321, number 5, pp. 451&ndash;452.<br>doi: <a href="https://doi.org/10.1001/jama.2018.21167" target="_blank">https://doi.org/10.1001/jama.2018.21167</a>, accessed 5 September 2020.</p>
+
+<p>Richard Elliott, 2002. &ldquo;Criminal law, public health and HIV transmission: A policy options paper,&rdquo; <em>UNAIDS (Joint United Nations Programme on HIV/AIDS)</em>, at <a href="https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf" target="_blank">https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf</a>, accessed 5 September 2020.</p>
+
+<p>Elizabeth F. Emens, 2008. &ldquo;Intimate discrimination: The state&rsquo;s role in the accidents of sex and love,&rdquo; <em>Harvard Law Review</em>, volume 122, number 5, pp. 1,307&ndash;1,402.<br>doi: <a href="https://doi.org/10.2307/40379752" target="_blank">https://doi.org/10.2307/40379752</a>, accessed 5 September 2020.</p>
+
+<p>Steven Epstein, 1996. <em>Impure science: AIDS, activism, and the politics of knowledge</em>. Berkeley: University of California Press.</p>
+
+<p>Amy L. Fairchild, Ronald Bayer, and James Colgrove, with Daniel Wolfe, 2007. <em>Searching eyes: Privacy, the state, and disease surveillance in America</em>. Berkeley: University of California Press.</p>
+
+<p>Mary D. Fan, 2012. &ldquo;Decentralizing STD surveillance: Toward better informed sexual consent,&rdquo; <em>Yale Journal of Health Policy, Law, and Ethics</em>, volume 12, number 1, pp. 1&ndash;38.</p>
+
+<p>Mary D. Fan, 2011. &ldquo;Sex, privacy, and public health in a casual encounters culture,&rdquo; <em>University of California Davis Law Review</em>, volume 25, pp. 531&ndash;596.</p>
+
+<p>Tim Fitzsimons, 2019. &ldquo;Inside Grindr, fears that China wanted to access user data via HIV research,&rdquo; <em>NBC News</em> (2 April), at <a href="https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996" target="_blank">https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996</a>, accessed 5 September 2020.</p>
+
+<p>Chandra L. Ford, Kathryn D. Whetten, Susan A. Hall, Jay S. Kaufman, and Angela D. Thrasher, 2007. &ldquo;Black sexuality, social construction, and research targeting &lsquo;The Down Low&rsquo; (&lsquo;The DL&rsquo;),&rdquo; <em>Annals of Epidemiology</em>, volume 17, number 3, pp. 209&ndash;216.<br>doi: <a href="https://doi.org/10.1016/j.annepidem.2006.09.006" target="_blank">https://doi.org/10.1016/j.annepidem.2006.09.006</a>, accessed 5 September 2020.</p>
+
+<p>A.J. Fortin, 1995. &ldquo;AIDS, surveillance, and public policy,&rdquo; <em>Research in Law and Policy Studies</em>, volume 4, pp. 173&ndash;197.</p>
+
+<p>Marilou Gagnon, 2012. &ldquo;Toward a critical response to HIV criminalization: Remarks on advocacy and social justice,&rdquo; <em>Journal of the Association of Nurses in AIDS Care</em>, volume 23, number 1, pp. 11&ndash;15.<br>doi: <a href="https://doi.org/10.1016/j.jana.2011.08.012" target="_blank">https://doi.org/10.1016/j.jana.2011.08.012</a>, accessed 5 September 2020.</p>
+
+<p>Carol L. Galletly and Steven D. Pinkerton, 2006. &ldquo;Conflicting messages: How criminal HIV disclosure laws undermine public health efforts to control the spread of HIV,&rdquo; <em>AIDS and Behavior</em>, volume 10, number 5, pp. 451&ndash;461.<br>doi: <a href="https://doi.org/10.1007/s10461-006-9117-3" target="_blank">https://doi.org/10.1007/s10461-006-9117-3</a>, accessed 5 September 2020.</p>
+
+<p>C. Galletly, Z. Lazzarini, C. Sanders, and S.D. Pinkerton, 2014. &ldquo;Criminal HIV exposure laws: Moving forward,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 1,011&ndash;1,013.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0731-1" target="_blank">https://doi.org/10.1007/s10461-014-0731-1</a>, accessed 5 September 2020.</p>
+
+<p>Robert C. Gallo, 2006. &ldquo;A reflection on HIV/AIDS research after 25 years,&rdquo; <em>Retrovirology</em>, volume 3, article number 72.<br>doi: <a href="https://doi.org/10.1186/1742-4690-3-72" target="_blank">https://doi.org/10.1186/1742-4690-3-72</a>, accessed 5 September 2020.</p>
+
+<p>George Gallup, Jr. and Jim Castelli, 1987. &ldquo;Poll catalogs views on AIDS by religion,&rdquo; <em>Dallas Morning News</em> (27 September), p. 45A.</p>
+
+<p>Lawrence O. Gostin, Scott Burris, and Zita Lazzarini, 1999. &ldquo;The law and the public&rsquo;s health: A study of infectious disease law in the United States,&rdquo; <em>Columbia Law Review</em>, volume 99, number 1, pp. 59&ndash;128.</p>
+
+<p>Ben Green, 2018. &ldquo;Data science as political action: Grounding data science in a politics of justice,&rdquo; <em>arXiv</em>:1811.03435 (6 November), at <a href="https://arxiv.org/abs/1811.03435" target="_blank">https://arxiv.org/abs/1811.03435</a>, accessed 5 September 2020.</p>
+
+<p>Kathryn Greene, Valerian J. Derlega, Gust A. Yep, and Sandra Petronio, 2003. <em>Privacy and disclosure of HIV in interpersonal relationships: A sourcebook for researchers and practitioners</em>. Mahwah, N.J.: Lawrence Erlbaum Associates.</p>
+
+<p>David M. Halperin, 2015. &ldquo;The biopolitics of HIV prevention discourse,&rdquo; In: Vernon W. Cisney and Nicolae Morar (editors). <em>Biopower: Foucault and beyond</em>. Chicago: University of Chicago Press, pp. 199&ndash;227.</p>
+
+<p>David M. Halperin and Trevor Hoppe (editors), 2017. <em>The war on sex</em>. Durham, N.C.: Duke University Press.</p>
+
+<p>Mark J. Handel and Irina Shklovski, 2012. &ldquo;Disclosure, ambiguity and risk reduction in real-time dating sites,&rdquo; <em>GROUP &rsquo;12: Proceedings of the 17th ACM International Conference on Supporting Group Work</em>, pp. 175&ndash;178.<br>doi: <a href="https://doi.org/10.1145/2389176.2389203" target="_blank">https://doi.org/10.1145/2389176.2389203</a>, accessed 5 September 2020.</p>
+
+<p>Jean Hardy and Silvia Lindtner, 2017. &ldquo;Constructing a desiring user: Discourse, rurality, and design in location-based social networks,&rdquo; <em>CSCW &rsquo;17: Proceedings of the 2017 ACM Conference on Computer Supported Cooperative Work and Social Computing</em>, pp. 13&ndash;25.<br>doi: <a href="https://doi.org/10.1145/2998181.2998347" target="_blank">https://doi.org/10.1145/2998181.2998347</a>, accessed 5 September 2020.</p>
+
+<p>Dini Harsono, Carol L. Galletly, Elaine O&rsquo;Keefe, and Zita Lazzarini, 2017. &ldquo;Criminalization of HIV exposure: A review of empirical studies in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 1, pp. 27&ndash;50.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1540-5" target="_blank">https://doi.org/10.1007/s10461-016-1540-5</a>, accessed 5 September 2020.</p>
+
+<p>Trevor Hoppe, 2018. <em>Punishing disease: HIV and the criminalization of sickness</em>. Berkeley: University of California Press.</p>
+
+<p>Hsiu-Fang Hsieh and Sarah E. Shannon, 2005. &ldquo;Three approaches to qualitative content analysis,&rdquo; <em>Qualitative Health Research</em>, volume 15, number 9, pp. 1,277&ndash;1,288.<br>doi: <a href="https://doi.org/10.1177/1049732305276687" target="_blank">https://doi.org/10.1177/1049732305276687</a>, accessed 5 September 2020.</p>
+
+<p>Jevan A. Hutson, Jessie G. Taft, Solon Barocas, and Karen Levy, 2018. &ldquo;Debiasing desire: Addressing bias &amp; discrimination on intimate platforms,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 73.<br>doi: <a href="https://doi.org/10.1145/3274342" target="_blank">https://doi.org/10.1145/3274342</a>, accessed 5 September 2020.</p>
+
+<p>Lilly Irani, Janet Vertesi, Paul Dourish, Kavita Philip, and Rebecca E. Grinter, 2010. &ldquo;Postcolonial computing: A lens on design and development,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,311&ndash;1,320.<br>doi: <a href="https://doi.org/10.1145/1753326.1753522" target="_blank">https://doi.org/10.1145/1753326.1753522</a>, accessed 5 September 2020.</p>
+
+<p>Steven J. Jackson, Tarleton Gillespie, and Sandy Payette, 2014. &ldquo;The policy knot: Re-integrating policy, practice and design in CSCW studies of social computing,&rdquo; <em>CSCW &rsquo;14: Proceedings of the 17th ACM Conference on Computer Supported Cooperative Work &amp; Social Computing</em>, pp. 588&ndash;602.<br>doi: <a href="https://doi.org/10.1145/2531602.2531674" target="_blank">https://doi.org/10.1145/2531602.2531674</a>, accessed 5 September 2020.</p>
+
+<p>Paula C. Johnson, 1992. &ldquo;Silence equals death: The response to AIDS within communities of color,&rdquo; <em>University of Illinois Law Review</em>, volume 1992, pp. 1,075&ndash;1,083.</p>
+
+<p>Ralf J&uuml;rgens, Jonathan Cohen, Edwin Cameron, Scott Burris, Michaela Clayton, Richard Elliott, Richard Pearshouse, Anne Gathumbi, and Delme Cupido, 2009. &ldquo;Ten reasons to oppose the criminalization of HIV exposure or transmission,&rdquo; <em>Reproductive Health Matters</em>, volume 17, number 34, pp. 163&ndash;172.<br>doi: <a href="https://doi.org/10.1016/S0968-8080(09)34462-6" target="_blank">https://doi.org/10.1016/S0968-8080(09)34462-6</a>, accessed 5 September 2020.</p>
+
+<p>Gopinaath Kannabiran, Shaowen Bardzell, and Jeffrey Bardzell, 2012. &ldquo;Designing (for) desire: a critical study of technosexuality in HCI,&rdquo; <em>NordiCHI &rsquo;12: Proceedings of the Seventh Nordic Conference on Human-Computer Interaction: Making Sense Through Design</em>, pp. 655&ndash;664.<br>doi: <a href="https://doi.org/10.1145/2399016.2399116" target="_blank">https://doi.org/10.1145/2399016.2399116</a>, accessed 5 September 2020.</p>
+
+<p>C&eacute;cile Kazatchkine, Edwin Bernard, and Patrick Eba, 2015. &ldquo;Ending overly broad HIV criminalization: Canadian scientists and clinicians stand for justice,&rdquo; <em>Journal of the International AIDS Society</em>, volume 18, number 1, pp. 201&ndash;226.<br>doi: <a href="https://doi.org/10.7448/IAS.18.1.20126" target="_blank">https://doi.org/10.7448/IAS.18.1.20126</a>, accessed 5 September 2020.</p>
+
+<p>Os Keyes, Jevan Hutson, and Meredith Durbin, 2019. &ldquo;A mulching proposal: Analysing and improving an algorithmic system for turning the elderly into high-nutrient slurry,&rdquo; <em>CHI EA &rsquo;19: Extended Abstracts of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number alt06.<br>doi: <a href="https://doi.org/10.1145/3290607.3310433" target="_blank">https://doi.org/10.1145/3290607.3310433</a>, accessed 5 September 2020.</p>
+
+<p>Jeffrey V. Lazarus, Kelly Safreed-Harmon, Simon E. Barton, Dominique Costagliola, Nikos Dedes, Julia del Amo Valero, Jose M. Gatell, Ricardo Baptista-Leite, Lu&iacute;s Mend&atilde;o, Kholoud Porter, Stefano Vella, and J&uuml;rgen Kurt Rockstroh, 2016. &ldquo;Beyond viral suppression of HIV &mdash; The new quality of life frontier,&rdquo; <em>BMC Medicine</em>, volume 14, number 1, article number 94.<br>doi: <a href="https://doi.org/10.1186/s12916-016-0640-4" target="_blank">https://doi.org/10.1186/s12916-016-0640-4</a>, accessed 5 September 2020.</p>
+
+<p>J. Stan Lehman, Meredith H. Carr, Allison J. Nichol, Alberto Ruisanchez, David W. Knight, Anne E. Langford, Simone C. Gray, and Jonathan H. Mermin, 2014. &ldquo;Prevalence and public health implications of state laws that criminalize potential HIV exposure in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 997&ndash;1,006.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0724-0" target="_blank">https://doi.org/10.1007/s10461-014-0724-0</a>, accessed 5 September 2020.</p>
+
+<p>Karen Levy and Solon Barocas, 2018. &ldquo;Designing against discrimination in online markets,&rdquo; <em>Berkeley Technology Law Journal</em>, volume 32, number 3, pp. 1,183&ndash;1,237.<br>doi: <a href="https://doi.org/10.15779/Z38BV79V7K" target="_blank">https://doi.org/10.15779/Z38BV79V7K</a>, accessed 5 September 2020.</p>
+
+<p>Eric Lichtblau and William M. Arkin, 2014. &ldquo;More federal agencies are using undercover operations,&rdquo; <em>New York Times</em> (15 November), at <a href="https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html" target="_blank">https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html</a>, accessed 5 September 2020.</p>
+
+<p>Ann Light, 2011. &ldquo;HCI as heterodoxy: Technologies of identity and the queering of interaction with computers,&rdquo; <em>Interacting with Computers</em>, volume 23, number 5, pp. 430&ndash;438.<br>doi: <a href="https://doi.org/10.1016/j.intcom.2011.02.002" target="_blank">https://doi.org/10.1016/j.intcom.2011.02.002</a>, accessed 5 September 2020.</p>
+
+<p>Ben Light, Jean Burgess, and Stefanie Duguay, 2018. &ldquo;The walkthrough method: An approach to the study of apps,&rdquo; <em>New Media &amp; Society</em>, volume 20, number 3, pp. 881&ndash;900.<br>doi: <a href="https://doi.org/10.1177/1461444816675438" target="_blank">https://doi.org/10.1177/1461444816675438</a>, accessed 5 September 2020.</p>
+
+<p>Anish P. Mahajan, Jennifer N. Sayles, Vishal A. Patel, Robert H. Remien, Daniel Ortiz, Greg Szekeres, and Thomas J. Coates, 2008. &ldquo;Stigma in the HIV/AIDS epidemic: A review of the literature and recommendations for the way forward,&rdquo; <em>AIDS</em>, volume 22, supplement 2, pp. S67&ndash;S79.<br>doi: <a href="https://doi.org/10.1097/01.aids.0000327438.13291.62" target="_blank">https://doi.org/10.1097/01.aids.0000327438.13291.62</a>, accessed 5 September 2020.</p>
+
+<p>Alexandra McCallum, 2014. &ldquo;Criminalizing the transmission of HIV: Consent, disclosure, and online dating,&rdquo; <em>Utah Law Review</em>, volume 2014, number 3, article 5, at <a href="https://dc.law.utah.edu/ulr/vol2014/iss3/5" target="_blank">https://dc.law.utah.edu/ulr/vol2014/iss3/5</a>, accessed 5 September 2020.</p>
+
+<p>Donna Hubbard McCree and Matthew Hogben, 2010. &ldquo;The contribution to and context of other sexually transmitted diseases and tuberculosis in the HIV/AIDS epidemic among African Americans,&rdquo; In: Donna Hubbard McCree, Kenneth Jones, and Ann O&rsquo;Leary (editors). <em>African Americans and HIV/AIDS: Understanding and addressing the epidemic</em>, New York: Springer, pp. 3&ndash;12.<br>doi: <a href="https://doi.org/10.1007/978-0-387-78321-5_1" target="_blank">https://doi.org/10.1007/978-0-387-78321-5_1</a>, accessed 5 September 2020.</p>
+
+<p>William C. Miller, Carol A. Ford, Martina Morris, Mark S. Handcock, John L. Schmitz, Marcia M. Hobbs, Myron S. Cohen, Kathleen Mullan Harris, and J. Richard Udry, 2004. &ldquo;Prevalence of chlamydial and gonococcal infections among young adults in the United States,&rdquo; <em>Journal of the American Medical Association</em>, volume 291, number 18, pp. 2,229&ndash;2,236.<br>doi: <a href="https://doi.org/10.1001/jama.291.18.2229" target="_blank">https://doi.org/10.1001/jama.291.18.2229</a>, accessed 5 September 2020.</p>
+
+<p>Viviane Namaste, 2015. <em>Oversight: Critical reflections on feminist research and politics</em>. Toronto: Women&rsquo;s Press.</p>
+
+<p>Angela Perone, 2013. &ldquo;From punitive to proactive: An alternative approach for responding to HIV criminalization that departs from penalizing marginalized communities,&rdquo; <em>Hastings Women&rsquo;s Law Journal</em>, volume 24, pp. 363&ndash;406, and at <a href="https://repository.uchastings.edu/hwlj/vol24/iss2/5" target="_blank">https://repository.uchastings.edu/hwlj/vol24/iss2/5</a>, accessed 5 September 2020.</p>
+
+<p>Deana A. Pollard, 2006. &ldquo;Sex torts,&rdquo; <em>Minnesota Law Review</em>, volume 91, pp. 769&ndash;824, and at <a href="https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf" target="_blank">https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf</a>, accessed 5 September 2020.</p>
+
+<p>POZ, 2015. &ldquo;Man with HIV arrested for seeking sex on social media,&rdquo; (22 July), at <a href="https://www.poz.com/article/stlouis-hiv-arrest-27534-4846" target="_blank">https://www.poz.com/article/stlouis-hiv-arrest-27534-4846</a>, accessed 5 September 2020.</p>
+
+<p>Russell K. Robinson, 2007. &ldquo;Structural dimensions of romantic preferences,&rdquo; <em>Fordham Law Review</em>, volume 76, pp. 2,787&ndash;2,820, and at <a href="http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/" target="_blank">http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/</a>, accessed 5 September 2020.</p>
+
+<p>Michael J. Rosenfeld and Reuben J. Thomas, 2012. &ldquo;Searching for a mate: The rise of the Internet as a social intermediary,&rdquo; <em>American Sociological Review</em>, volume 77, number 4, pp. 523&ndash;547.<br>doi: <a href="https://doi.org/10.1177/0003122412448050" target="_blank">https://doi.org/10.1177/0003122412448050</a>, accessed 5 September 2020.</p>
+
+<p>B.R. Simon Rosser, J. Michael Wilkerson, Derek J. Smolenski, J. Michael Oakes, Joseph Konstan, Keith J. Horvath, Gunna R. Kilian, David S. Novak, Gene P. Danilenko, and Richard Morgan, 2011. &ldquo;The future of Internet-based HIV prevention: A report on key findings from the Men&rsquo;s INTernet (MINTS-I, II) Sex Studies,&rdquo; <em>AIDS and Behavior</em>, volume 15, supplement 1, pp. S91&ndash;S100.<br>doi: <a href="https://doi.org/10.1007/s10461-011-9910-5" target="_blank">https://doi.org/10.1007/s10461-011-9910-5</a>, accessed 5 September 2020.</p>
+
+<p>Brian Schram, 2019. &ldquo;Accidental orientations: Rethinking queerness in archival times,&rdquo; <em>Surveillance &amp; Society</em>, volume 17, number 5, pp. 602&ndash;617.<br>doi: <a href="https://doi.org/10.24908/ss.v17i5.8688" target="_blank">https://doi.org/10.24908/ss.v17i5.8688</a>, accessed 5 September 2020.</p>
+
+<p>Junichi P. Semitsu, 2011. &ldquo;From Facebook to mug shot: How the dearth of social networking privacy rights revolutionized online government surveillance,&rdquo; <em>Pace Law Review</em>, volume 31, number 1, pp. 291&ndash;381, and at <a href="https://digitalcommons.pace.edu/plr/vol31/iss1/7" target="_blank">https://digitalcommons.pace.edu/plr/vol31/iss1/7</a>, accessed 5 September 2020.</p>
+
+<p>Sero Project, 2012. &ldquo;National criminalization survey preliminary results,&rdquo; (25 July), at <a href="https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/" target="_blank">https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/</a>, accessed 30 August 2019.</p>
+
+<p>Julianne M. Serovich and Katie E. Mosack, 2003. &ldquo;Reasons for HIV disclosure or nondisclosure to casual sexual partners,&rdquo; <em>AIDS Education and Prevention</em>, volume 15, number 1, pp. 70&ndash;80.</p>
+
+<p>Natasha Singer, 2018. &ldquo;Grindr sets off privacy firestorm after sharing users&rsquo; H.I.V.-status data,&rdquo; <em>New York Times</em> (3 April), at <a href="https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html" target="_blank">https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html</a>, accessed 5 September 2020.</p>
+
+<p>Lucy Suchman, 2011. &ldquo;Anthropological relocations and the limits of design,&rdquo; <em>Annual Review of Anthropology</em>, volume 40, pp. 1&ndash;18.<br>doi: <a href="https://doi.org/10.1146/annurev.anthro.041608.105640" target="_blank">https://doi.org/10.1146/annurev.anthro.041608.105640</a>, accessed 5 September 2020.</p>
+
+<p>Cass R. Sunstein, 1996. &ldquo;Social norms and social roles,&rdquo; <em>Columbia Law Review</em>, volume 96, number 4, pp. 903&ndash;968.</p>
+
+<p>Patricia Sweeney, Simone C. Gray, David W. Purcell, Jenny Sewell, Aruna Surendera Babu, Brett A. Tarver, Joseph Prejean, and Jonathan Mermin, 2017. &ldquo;Association of HIV diagnosis rates and laws criminalizing HIV exposure in the United States,&rdquo; <em>AIDS</em>, volume 31, number 10, pp. 1,483&ndash;1,488.<br>doi: <a href="https://doi.org/10.1097/QAD.0000000000001501" target="_blank">https://doi.org/10.1097/QAD.0000000000001501</a>, accessed 5 September 2020.</p>
+
+<p>Bryan L. Sykes, Trevor A. Hoppe, and Kristen D. Maziarka, 2016. &ldquo;Cruel intentions? HIV prevalence and criminalization during an age of mass incarceration, U.S. 1999 to 2012,&rdquo; <em>Medicine (Baltimore)</em>, volume 95, number 16, e3352.<br>doi: <a href="https://doi.org/10.1097/MD.0000000000003352" target="_blank">https://doi.org/10.1097/MD.0000000000003352</a>, accessed 5 September 2020.</p>
+
+<p>Samuel Hardman Taylor, Jevan Alexander Hutson, and Tyler Richard Alicea, 2017. &ldquo;Social consequences of Grindr use: Extending the Internet-enhanced self-disclosure hypothesis,&rdquo; <em>CHI &rsquo;17: Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems</em>, pp. 6,645&ndash;6,657.<br>doi: <a href="https://doi.org/10.1145/3025453.3025775" target="_blank">https://doi.org/10.1145/3025453.3025775</a>, accessed 5 September 2020.</p>
+
+<p>Steven Thrasher, 2015. &ldquo;A Black body on trial: The conviction of HIV-positive &lsquo;Tiger Mandingo&rsquo;,&rdquo; <em>BuzzFeed News</em> (30 November), at <a href="https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m" target="_blank">https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m</a>, accessed 5 September 2020.</p>
+
+<p>Liming Wang, Dylan Podson, Zihuang Chen, Hongyan Lu, Vania Wang, Colin Shepard, John K. Williams, and Guodong Mi, 2019. &ldquo;Using social media to increase HIV testing among men who have sex with men &mdash; Beijing, China, 2013&ndash;2017,&rdquo; <em>Morbidity and Mortality Weekly Report</em>, volume 68, number 21, pp. 478&ndash;482.<br>doi: <a href="http://dx.doi.org/10.15585/mmwr.mm6821a3" target="_blank">http://dx.doi.org/10.15585/mmwr.mm6821a3</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward, 2005. &ldquo;Partner notification and contact-tracing,&rdquo; <em>Medicine</em>, volume 33, number 9, pp. 28&ndash;30.<br>doi: <a href="https://doi.org/10.1383/medc.2005.33.9.28" target="_blank">https://doi.org/10.1383/medc.2005.33.9.28</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward and Gill Bell, 2014. &ldquo;Partner notification,&rdquo; <em>Medicine (Abingdon)</em>, volume 42, number 6, pp. 314&ndash;317.<br>doi: <a href="https://doi.org/10.1016/j.mpmed.2014.03.013" target="_blank">https://doi.org/10.1016/j.mpmed.2014.03.013</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Andreas Gutmann, M. Angela Sasse, and Ann Blandford, 2018. &ldquo;Privacy unraveling around explicit HIV status disclosure fields in the online geosocial hookup app Grindr,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 181.<br>doi: <a href="https://doi.org/10.1145/3274450" target="_blank">https://doi.org/10.1145/3274450</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Juan F. Maestre, Jo Gibbs, Chia-Fang Chung, and Ann Blandford, 2019. &ldquo;Signal appropriation of explicit HIV status disclosure fields in sex-social apps used by gay and bisexual men,&rdquo; <em>CHI &rsquo;19: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number 692.<br>doi: <a href="https://doi.org/10.1145/3290605.3300922" target="_blank">https://doi.org/10.1145/3290605.3300922</a>, accessed 5 September 2020.</p>
+
+<p>Dylan Eric Wittkower, 2016. &ldquo;Lurkers, creepers, and virtuous interactivity: From property rights to consent to care as a conceptual basis for privacy concerns and information ethics,&rdquo; <em>First Monday</em>, volume 21, number 10, at <a href="https://firstmonday.org/article/view/6948/5628" target="_blank">https://firstmonday.org/article/view/6948/5628</a>, accessed 5 September 2020.<br>doi: <a href="https://doi.org/10.5210/fm.v21i10.6948" target="_blank">https://doi.org/10.5210/fm.v21i10.6948</a>, accessed 5 September 2020.</p>
+
+<p>Dan Wohlfeiler, Jennifer Hecht, Jonathan Volk, H. Fisher Raymond, Tom Kennedy, and Willi McFarland, 2013. &ldquo;How can we improve online HIV and STD prevention for men who have sex with men? Perspectives of hook-up website owners, website users, and HIV/STD directors,&rdquo; <em>AIDS and Behavior</em>, volume 17, number 9, pp. 3,024&ndash;3,033.<br>doi: <a href="https://doi.org/10.1007/s10461-012-0375-y" target="_blank">https://doi.org/10.1007/s10461-012-0375-y</a>, accessed 5 September 2020.</p>
+
+<p>Mar&iacute;a Cecilia Zea, Carol A. Reisen, Paul J. Poppen, and Rafael M. D&iacute;az, 2003. &ldquo;Asking and telling: communication about HIV status among Latino HIV-positive gay men,&rdquo; <em>AIDS and Behavior</em>, volume 7, number 2, pp. 143&ndash;152.<br>doi: <a href="https://doi.org/10.1023/A:1023994207984" target="_blank">https://doi.org/10.1023/A:1023994207984</a>, accessed 5 September 2020.</p>
+
+<p>Shoshana Zuboff, 2019. <em>The age of surveillance capitalism: The fight for a human future at the new frontier of power</em>. London: Profile Books.</p>
+
+<p>&nbsp;</p>
+<hr width="300">
+
+<p><strong>Editorial history</strong></p>
+<p>Received 17 October 2019; revised 12 February 2020; accepted 28 August 2020.</p>
+
+<hr>
+
+<p><a href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" src="https://i.creativecommons.org/l/by/4.0/80x15.png"></a><br>This paper is licensed under a <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</p>
+
+<p>Surveillance, stigma &amp; sociotechnical design for HIV<br>by Calvin Liang, Jevan Alexander Hutson, and Os Keyes.<br><em>First Monday</em>, Volume 25, Number 10 - 5 October 2020<br>https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729<br>doi: <a href="http://dx.doi.org/10.5210/fm.v25i10.10274" target="_blank">http://dx.doi.org/10.5210/fm.v25i10.10274</a></p>
+</blockquote>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html
new file mode 100644
index 0000000..2633256
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_landingpage.html
@@ -0,0 +1,616 @@
+ <!DOCTYPE html>
+<html lang="en-US" xml:lang="en-US">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ | First Monday
+ </title>
+
+
+<meta name="generator" content="Open Journal Systems 3.1.2.0">
+<link rel="icon" href="https://firstmonday.org/ojs/public/journals/3/favicon_en_US.gif">
+<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+<meta name="DC.Coverage" xml:lang="en" content=""/>
+<meta name="DC.Creator.PersonalName" content="Calvin Liang"/>
+<meta name="DC.Creator.PersonalName" content="Jevan Alexander Hutson"/>
+<meta name="DC.Creator.PersonalName" content="Os Keyes"/>
+<meta name="DC.Date.created" scheme="ISO8601" content="2020-09-10"/>
+<meta name="DC.Date.dateSubmitted" scheme="ISO8601" content="2019-09-15"/>
+<meta name="DC.Date.issued" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Date.modified" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Description" xml:lang="en" content="Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."/>
+<meta name="DC.Format" scheme="IMT" content="text/html"/>
+<meta name="DC.Identifier" content="10274"/>
+<meta name="DC.Identifier.DOI" content="10.5210/fm.v25i10.10274"/>
+<meta name="DC.Identifier.URI" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="DC.Language" scheme="ISO639-1" content="en"/>
+<meta name="DC.Rights" content="Copyright (c) 2020 First Monday"/>
+<meta name="DC.Rights" content=""/>
+<meta name="DC.Source" content="First Monday"/>
+<meta name="DC.Source.ISSN" content="1396-0466"/>
+<meta name="DC.Source.URI" content="https://firstmonday.org/ojs/index.php/fm"/>
+<meta name="DC.Subject" xml:lang="en" content="HIV"/>
+<meta name="DC.Subject" xml:lang="en" content="online dating"/>
+<meta name="DC.Subject" xml:lang="en" content="design"/>
+<meta name="DC.Subject" xml:lang="en" content="policy"/>
+<meta name="DC.Subject" xml:lang="en" content="surveillance"/>
+<meta name="DC.Subject" xml:lang="en" content="intimacy"/>
+<meta name="DC.Subject" xml:lang="en" content="social computing"/>
+<meta name="DC.Subject" xml:lang="en" content="social justice"/>
+<meta name="DC.Title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="DC.Type" content="Text.Serial.Journal"/>
+<meta name="DC.Type" xml:lang="en" content="Qualitative; Content analysis"/>
+<meta name="DC.Type.articleType" content="Articles"/>
+<meta name="gs_meta_revision" content="1.1"/>
+<meta name="citation_journal_title" content="First Monday"/>
+<meta name="citation_journal_abbrev" content="1"/>
+<meta name="citation_issn" content="1396-0466"/>
+<meta name="citation_author" content="Calvin Liang"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_author" content="Jevan Alexander Hutson"/>
+<meta name="citation_author_institution" content="University of Washington, School of Law"/>
+<meta name="citation_author" content="Os Keyes"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="citation_date" content="2020/09/10"/>
+<meta name="citation_doi" content="10.5210/fm.v25i10.10274"/>
+<meta name="citation_abstract_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="citation_language" content="en"/>
+<meta name="citation_keywords" xml:lang="en" content="HIV"/>
+<meta name="citation_keywords" xml:lang="en" content="online dating"/>
+<meta name="citation_keywords" xml:lang="en" content="design"/>
+<meta name="citation_keywords" xml:lang="en" content="policy"/>
+<meta name="citation_keywords" xml:lang="en" content="surveillance"/>
+<meta name="citation_keywords" xml:lang="en" content="intimacy"/>
+<meta name="citation_keywords" xml:lang="en" content="social computing"/>
+<meta name="citation_keywords" xml:lang="en" content="social justice"/>
+<meta name="citation_fulltext_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"/>
+<link rel="alternate" type="application/atom+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+<link rel="alternate" type="application/rdf+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+<link rel="alternate" type="application/rss+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <link rel="stylesheet" href="https://firstmonday.org/ojs/index.php/fm/$$$call$$$/page/page/css?name=stylesheet" type="text/css" /><link rel="stylesheet" href="//fonts.googleapis.com/css?family=Noto+Sans:400,400italic,700,700italic" type="text/css" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.css" type="text/css" /><link rel="stylesheet" href="https://firstmonday.org/ojs/public/journals/3/styleSheet.css" type="text/css" />
+</head>
+<body class="pkp_page_article pkp_op_view has_site_logo" dir="ltr">
+
+ <div class="cmp_skip_to_content">
+ <a href="#pkp_content_main">Skip to main content</a>
+ <a href="#pkp_content_nav">Skip to main navigation menu</a>
+ <a href="#pkp_content_footer">Skip to site footer</a>
+ </div>
+ <div class="pkp_structure_page">
+
+ <header class="pkp_structure_head" id="headerNavigationContainer" role="banner">
+ <div class="pkp_head_wrapper">
+
+ <div class="pkp_site_name_wrapper">
+ <div class="pkp_site_name">
+ <a href=" https://firstmonday.org/ojs/index.php/fm/index
+ " class="is_img">
+ <img src="https://firstmonday.org/ojs/public/journals/3/pageHeaderLogoImage_en_US.gif" width="252" height="102" alt="Page Header Logo" />
+ </a>
+ </div>
+ </div>
+
+
+ <nav class="pkp_navigation_primary_row" aria-label="Site Navigation">
+ <div class="pkp_navigation_primary_wrapper">
+ <ul id="navigationPrimary" class="pkp_navigation_primary pkp_nav_list">
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About
+ </a>
+ <ul>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About the Journal
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/editorialTeam">
+ Editorial Team
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/privacy">
+ Privacy Statement
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/contact">
+ Contact
+ </a>
+ </li>
+ </ul>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search">
+ Search
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/current">
+ Current
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/announcement">
+ Announcements
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/submissions">
+ Submissions
+ </a>
+ </li>
+ </ul>
+
+
+
+ <form class="pkp_search" action="https://firstmonday.org/ojs/index.php/fm/search/search" method="post" role="search">
+ <input type="hidden" name="csrfToken" value="671acac3a608346eb0eb4de1f26c7563">
+ <input name="query" value="" type="text" aria-label="Search Query">
+ <button type="submit">
+ Search
+ </button>
+ <div class="search_controls" aria-hidden="true">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search" class="headerSearchPrompt search_prompt" aria-hidden="true">
+ Search
+ </a>
+ <a href="#" class="search_cancel headerSearchCancel" aria-hidden="true"></a>
+ <span class="search_loading" aria-hidden="true"></span>
+ </div>
+</form>
+ </div>
+ </nav>
+
+ <nav class="pkp_navigation_user_wrapper" id="navigationUserWrapper" aria-label="User Navigation">
+ <ul id="navigationUser" class="pkp_navigation_user pkp_nav_list">
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/user/register">
+ Register
+ </a>
+ </li>
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/login">
+ Login
+ </a>
+ </li>
+ </ul>
+
+ </nav>
+ </div><!-- .pkp_head_wrapper -->
+ </header><!-- .pkp_structure_head -->
+
+ <div class="pkp_structure_content has_sidebar">
+ <div id="pkp_content_main" class="pkp_structure_main" role="main">
+
+<div class="page page_article">
+ <nav class="cmp_breadcrumbs" role="navigation" aria-label="You are here:">
+ <ol>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/index">
+ Home
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li class="current">
+ Articles
+ </li>
+ </ol>
+</nav>
+
+ <article class="obj_article_details">
+ <h1 class="page_title">
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ </h1>
+
+
+ <div class="row">
+ <div class="main_entry">
+
+ <ul class="item authors">
+ <li>
+ <span class="name">
+ Calvin Liang
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0002-3795-3441" target="_blank">
+ https://orcid.org/0000-0002-3795-3441
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Jevan Alexander Hutson
+ </span>
+ <span class="affiliation">
+ University of Washington, School of Law
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0003-3312-1733" target="_blank">
+ https://orcid.org/0000-0003-3312-1733
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Os Keyes
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0001-5196-609X" target="_blank">
+ https://orcid.org/0000-0001-5196-609X
+ </a>
+ </span>
+ </li>
+ </ul>
+
+ <div class="item doi">
+ <span class="label">
+ DOI:
+ </span>
+ <span class="value">
+ <a href="https://doi.org/10.5210/fm.v25i10.10274">
+ https://doi.org/10.5210/fm.v25i10.10274
+ </a>
+ </span>
+ </div>
+
+ <div class="item keywords">
+ <span class="label">
+ Keywords:
+ </span>
+ <span class="value">
+ HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice </span>
+ </div>
+
+ <div class="item abstract">
+ <h3 class="label">Abstract</h3>
+ <p>Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+ </div>
+
+
+
+ <div class="item author_bios">
+ <h3 class="label">
+ Author Biographies
+ </h3>
+ <div class="sub_item">
+ <div class="label">
+ Calvin Liang, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ <p>Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.</p>
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Jevan Alexander Hutson, <span class="affiliation">University of Washington, School of Law</span>
+ </div>
+ <div class="value">
+ Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Os Keyes, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
+ </div>
+ </div>
+ </div>
+
+
+ </div><!-- .main_entry -->
+
+ <div class="entry_details">
+
+ <div class="item cover_image">
+ <div class="sub_item">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ <img src="https://firstmonday.org/ojs/public/journals/3/cover_issue_678_en_US.png" alt="“Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet”">
+ </a>
+ </div>
+ </div>
+
+ <div class="item galleys">
+ <ul class="value galleys_links">
+ <li>
+
+
+
+
+<a class="obj_galley_link file" href="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729">
+
+
+ HTML
+
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="item published">
+ <div class="label">
+ Published
+ </div>
+ <div class="value">
+ 2020-09-10
+ </div>
+ </div>
+
+ <div class="item citation">
+ <div class="sub_item citation_display">
+ <div class="label">
+ How to Cite
+ </div>
+ <div class="value">
+ <div id="citationOutput" role="region" aria-live="polite">
+ <div class="csl-bib-body">
+ <div class="csl-entry">Liang, C., Hutson, J. A., &#38; Keyes, O. (2020). Surveillance, stigma &amp; sociotechnical design for HIV. <i>First Monday</i>, <i>25</i>(10). https://doi.org/10.5210/fm.v25i10.10274</div>
+</div>
+ </div>
+ <div class="citation_formats">
+ <button class="cmp_button citation_formats_button" aria-controls="cslCitationFormats" aria-expanded="false" data-csl-dropdown="true">
+ More Citation Formats
+ </button>
+ <div id="cslCitationFormats" class="citation_formats_list" aria-hidden="true">
+ <ul class="citation_formats_styles">
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274&amp;return=json"
+ >
+ ACM
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274&amp;return=json"
+ >
+ ACS
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274&amp;return=json"
+ >
+ APA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274&amp;return=json"
+ >
+ ABNT
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274&amp;return=json"
+ >
+ Chicago
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274&amp;return=json"
+ >
+ Harvard
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274&amp;return=json"
+ >
+ IEEE
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274&amp;return=json"
+ >
+ MLA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274&amp;return=json"
+ >
+ Turabian
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274&amp;return=json"
+ >
+ Vancouver
+ </a>
+ </li>
+ </ul>
+ <div class="label">
+ Download Citation
+ </div>
+ <ul class="citation_formats_styles">
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/ris?submissionId=10274">
+ <span class="fa fa-download"></span>
+ Endnote/Zotero/Mendeley (RIS)
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/bibtex?submissionId=10274">
+ <span class="fa fa-download"></span>
+ BibTeX
+ </a>
+ </li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="item issue">
+ <div class="sub_item">
+ <div class="label">
+ Issue
+ </div>
+ <div class="value">
+ <a class="title" href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ </div>
+ </div>
+
+ <div class="sub_item">
+ <div class="label">
+ Section
+ </div>
+ <div class="value">
+ Articles
+ </div>
+ </div>
+ </div>
+
+
+ <div class="item copyright">
+ <p>Authors retain copyright to their work published in <em>First Monday</em>. Please see the footer of each article for details.</p>
+ </div>
+
+
+
+ </div><!-- .entry_details -->
+ </div><!-- .row -->
+
+</article>
+
+
+
+</div><!-- .page -->
+
+ </div><!-- pkp_structure_main -->
+
+ <div class="pkp_structure_sidebar left" role="complementary" aria-label="Sidebar">
+ <div class="pkp_block block_developed_by">
+ <div class="content">
+ <a href="http://pkp.sfu.ca/ojs/">
+ Open Journal Systems
+ </a>
+ </div>
+</div>
+<div class="pkp_block block_web_feed">
+ <span class="title">Current Issue</span>
+ <div class="content">
+ <ul>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/atom.svg" alt="Atom logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss20_logo.svg" alt="RSS2 logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss10_logo.svg" alt="RSS1 logo">
+ </a>
+ </li>
+ </ul>
+ </div>
+</div>
+
+ </div><!-- pkp_sidebar.left -->
+ </div><!-- pkp_structure_content -->
+
+<div id="pkp_content_footer" class="pkp_structure_footer_wrapper" role="contentinfo">
+
+ <div class="pkp_structure_footer">
+
+ <div class="pkp_footer_content">
+ <p>A Great Cities Initiative of the University of Illinois at Chicago&nbsp;<a href="http://library.uic.edu/">University Library</a>.</p>
+<p>©&nbsp;<em>First Monday</em>, 1995-2020. ISSN&nbsp;1396-0466.</p>
+ </div>
+
+ <div class="pkp_brand_footer" role="complementary">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/aboutThisPublishingSystem">
+ <img alt="About this Publishing System" src="https://firstmonday.org/ojs/templates/images/ojs_brand.png">
+ </a>
+ </div>
+ </div>
+</div><!-- pkp_structure_footer_wrapper -->
+
+</div><!-- pkp_structure_page -->
+
+<script src="//ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js" type="text/javascript"></script><script src="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.0/jquery-ui.min.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/lib/pkp/js/lib/jquery/plugins/jquery.tag-it.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/popper/popper.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/util.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/dropdown.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/main.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/generic/citationStyleLanguage/js/articleCitation.js" type="text/javascript"></script><script type="text/javascript">
+(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+ga('create', 'UA-41314203-1', 'auto');
+ga('send', 'pageview');
+</script>
+
+
+</body>
+</html>
diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html
new file mode 100644
index 0000000..49cada8
--- /dev/null
+++ b/python/tests/files/genders_g58_fairlie.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>Genders OnLine Journal - Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film.</title>
+<meta name="description" content="Analysis of Hitchcock’s Rope (1948) as a critique of heteromasculinity that thematizes queer anguish, orality, and women’s relationship to the covert world of homosexual knowledge.">
+<meta name="keywords" content="homosexuality, homophobia, Cold War, the closet, heteromasculinity, queer anguish, anus, suspicion, orality, eating, cannibalism, Catholicism, knowledge, the cinematic cut, cinematic reality, women in Hitchcock, women and gay men, lack, hypocrisy, straight male interlocutor.">
+<style type="text/css">
+<!--
+
+td {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 13px;
+}
+
+.Section1 {
+ page:Section1;
+}
+-->
+</style>
+</head>
+<body alink="#000088" background="../image/back.jpg" vlink="#00aa00">
+<p>
+<table width="600">
+ <tbody>
+ <tr>
+ <td valign="top" width="90"><p><img src="../image/indlgo.gif" alt="Genders OnLine Journal" align="bottom" border="0" height="530" width="97"> </p></td>
+ <td align="right" valign="top" width="530"><table width="530">
+ <tbody>
+ <tr>
+ <td valign="top"><p><b><font size="2">Issue 58</font></b>, Fall 2013</p>
+ <p><font size="5"><strong>Reading Maeshowe</strong></font> <br>
+ Recovering the Feminine in a Neolithic Tomb</p>
+<p>By <strong>CHARLOTTE FAIRLIE</strong></p>
+ <p>[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.</p>
+ <p>[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are &ldquo;Stone Age marvels which inspire and astonish,&rdquo; and Simon W. Hall expresses the experiences of many when he refers to &ldquo;the profound impact of entering a tomb&rdquo; (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.</p>
+ <p> [3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, <u>The History of Orkney Literature</u>, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take &ldquo;the opportunity to reestablish the place&mdash;and, significantly, the inner lives of women in the prehistoric or early historical northern landscape&rdquo; (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie&rsquo;s, Schneider&rsquo;s, and Rose&rsquo;s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.</p>
+ <p>[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: &ldquo;the journey will be one of consequence.&rdquo; Moving from the light of day to the dark mysteries of a tomb&rsquo;s interior &ldquo;is a passage from the profane to the sacred.&rdquo; As such, &ldquo;it will involve transformation&rdquo; (&ldquo;Doorways&rdquo; 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, &ldquo;If the Orkney-Cromarty &lsquo;chambered&rsquo; tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?&rdquo; (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that &ldquo;the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation&rdquo; (&ldquo;Doorways&rdquo; 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were &ldquo;built to be experienced visually, physically and imaginatively,&rdquo; an experience which may well result in some kind of &ldquo;revelation&rdquo; (&ldquo;Doorways&rdquo; 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like <u>Beowulf</u>, <u>Hamlet</u>, or <u>Moby Dick</u>, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.</p>
+ <p>[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, &ldquo;Maes Howe Sappho.&rdquo; Noting the continuing appeal of the tomb, how today &ldquo;people still treasure&rdquo; the moment that the sun lines up with the passage, he compares the ancient monument to poetry:</p><blockquote>However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61)
+ </blockquote>
+ <p>Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments &ldquo;were and are locations that embodied the biography of the builders, users, spectators, and excavators&rdquo; (10). It follows that if we think of Maeshowe as a text, Brophy&rsquo;s assertion that the monument absorbs the &ldquo;biography&rdquo; of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.</p>
+ <p>[6] In a 1977 column for <u>The Orcadian</u> newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: &ldquo;Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 &ldquo;Two Maeshowe Poems&rdquo; (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See &ldquo;Heart of Neolithic Orkney WHS: Setting Project&rdquo; 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown&rsquo;s poem moves seamlessly through a series of images that integrate Brodgar&rsquo;s &ldquo;light and darkness&rdquo; with Maeshowe&rsquo;s &ldquo;flowers [and] stone&rdquo; (a reference to the runic graffiti carved by Vikings inside the tomb) and &ldquo;skulls&rdquo; (Lines 1, 9, 11). The first word of the poem, &ldquo;Circle,&rdquo; is semantically echoed in the initial word of each ensuing stanza, &ldquo;Ring,&rdquo; &ldquo;Wheel,&rdquo; and &ldquo;Round,&rdquo; subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of &ldquo;life and death&rdquo; (7). For this is a poem about regeneration, how &ldquo;Out of those skulls / Breaks the first green shoot, the full ear, then the bread&rdquo; (11-12). 
Throughout, juxtaposed images look for the positive to outweigh the negative: &ldquo;We move in shadows,&rdquo; but &ldquo;Brodgar has burned on the moor a dance of sun&rdquo;; &ldquo;Ring of quern and plough&rdquo; (a quern is a stone for grinding grain) are charged to &ldquo;contain / Our tumults of blood&rdquo;; &ldquo;The stars&rsquo; chaos is caught in a strict rein&rdquo;; the word &ldquo;stone&rdquo; is enveloped by &ldquo;flowers,&rdquo; and &ldquo;beauty and love&rdquo;; similarly, &ldquo;snow&rdquo; is flanked by &ldquo;sun&rdquo; and &ldquo;seed.&rdquo; So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.</p>
+ <p>[7] Brown&rsquo;s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted &ldquo;an analogy with the human life cycle and past generations&rdquo; (&ldquo;Doorways&rdquo; 65). Time&rsquo;s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were &ldquo;a metaphorical extension of daily life&rdquo; (&ldquo;Doorways&rdquo; 76). Trevor Garnham, an architect, develops that idea further: &ldquo;Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life&rsquo;s rebirth in some other form&rdquo; (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham&rsquo;s analogy is comparable to the positive final image of Brown&rsquo;s poem, the &ldquo;skulls&rdquo; engendering the &ldquo;green shoots&rdquo; and the &ldquo;bread&rdquo; of life.</p>
+ <p>[8] Brown had written earlier of Maeshowe in his 1996 poem, &ldquo;Maeshowe: Midwinter,&rdquo; choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, &ldquo;Equinox to Hallowmass,&rdquo; immediately integrates the astronomical with the sacred, giving the season of &ldquo;darkness&rdquo; both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes &ldquo;St Lucy,&rdquo; whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose &ldquo;shuttle&rdquo; creates &ldquo;a dark web&rdquo; that &ldquo;fills the loom&rdquo; (7-9), placing at the centre of the poem a world in which light is completely absent: &ldquo;The blackness is solid as a / stone that locks a tomb. / No star shines there&rdquo; (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the &ldquo;true ceremony,&rdquo; suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then &ldquo;the last fleeting solstice flame&rdquo; is &ldquo;caught up,&rdquo; leading to an optimistic note as the children&mdash;the future&mdash;sing with &ldquo;voices like leaves of light&rdquo; (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.</p>
+ <p>[9] While Brown&rsquo;s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as &ldquo;an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people&rsquo;s life into reciprocal relationships with the forces that give shape to their world&rdquo; (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown&rsquo;s two Maeshowe poems together reveals that the &ldquo;green shoot&rdquo; which produces the &ldquo;bread&rdquo; corresponds to the youthful &ldquo;voices like leaves of light.&rdquo; In fact, his insertion of &ldquo;leaves,&rdquo; with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed&mdash;working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth&mdash;are the web of human existence. The physical bread and the metaphysical song are one.</p>
+ <p>[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie&rsquo;s essay, &ldquo;Darkness and Light,&rdquo; describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible &ldquo;to see the real dark for the metaphorical dark . . .the death-dark.&rdquo; Enjoyment of the &ldquo;natural, courteous dark,&rdquo; she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that &ldquo;Maes Howe is a metaphor,&rdquo; perhaps exposing a flaw in Jamie&rsquo;s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor&rsquo;s crew assault her eyes, so she rediscovers no &ldquo;courteous darkness&rdquo; and witnesses &ldquo;no resurrecting beam of sunlight&rdquo; (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of &ldquo;wonder&rdquo; similar to Brown&rsquo;s in <u>The Orcadian</u>, she asks, &ldquo;Were they the first people . . . to articulate this metaphor of light and dark, of life and death?&rdquo; and reflects upon its significance:</p><blockquote>For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We&rsquo;re still mortal, still don&rsquo;t want to die, don&rsquo;t want our loved ones to die. (19-20)
+ </blockquote>
+ <p>Her rejection of a metaphor that she has considered &ldquo;[worn] out&rdquo; and &ldquo;redundant&rdquo; (4, 9) turns out to have been less literary and more personally psychological, for Jamie&rsquo;s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The &ldquo;tender, potent gesture&rdquo; of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).</p>
+ <p>[11] In her poem, &ldquo;Maeshowe,&rdquo; Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the &ldquo;chill seeps into your body&rdquo; (14). However, this ominous impression is immediately dismissed because &ldquo;a stillness that&rsquo;s other than death inhabits / this place where the undead gather to greet the dead&rdquo; (15-17). The journey through the passage will take &ldquo;you&rdquo; to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor&rsquo;s memories:</p><blockquote>In midwinter you&rsquo;ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26)
+ </blockquote>
+ <p>These images recall Garnham&rsquo;s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker&rsquo;s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the &ldquo;ground&rdquo; will turn &ldquo;gold,&rdquo; like a field of ripe grain. Schneider&rsquo;s poem, like Brown&rsquo;s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is &ldquo;piercing / the dark.&rdquo; There is a Romantic quality to these lines: Maeshowe will stay with Schneider&rsquo;s speaker as those daffodils stay with Wordsworth, &ldquo;to flash upon the inward eye / That is the bliss of solitude,&rdquo; to stimulate the imagination (24). Having herself benefited from the tomb&rsquo;s restorative qualities, the speaker is inspired to spread the word, to share her revelation with &ldquo;you,&rdquo; the reader.</p>
+ <p>[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as &ldquo;The first island poems,&rdquo; Brown quotes them emphatically in the second of the paired poems: &ldquo;INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES&rdquo; (&ldquo;Two&rdquo; 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in &ldquo;Maeshowe Nipple,&rdquo; for instance, lists the prosaic concerns of the Vikings, portraying them as &ldquo;intrepid&rdquo; but also homesick, missing &ldquo;sweethearts and family&rdquo; (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the &ldquo;young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind&rdquo;; &ldquo;the dragon boats moved on&rdquo; (Brown &ldquo;Two&rdquo; 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of &ldquo;the first green shoot, the full ear, then the bread&rdquo; in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, &ldquo;their tongue / took root and sprouted from invaded soil / green words for <u>Father</u>, <u>Daughter</u>, <u>Bread</u>&rdquo; (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are &ldquo;straight-branched&rdquo; (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. 
The bony runes, therefore, have become metaphorical seeds, and Rose&rsquo;s speaker, like Garnham, sees hope in the bone/seed analogy.</p>
+ <p>[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun&rsquo;s midwinter alignment, which Brown describes as a &ldquo;pledge of renewal, a cry of resurrection&rdquo; (&ldquo;Maeshowe in Midwinter&rdquo; 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.</p>
+ <p>[14] Kathleen Jamie states early in her essay, &ldquo;We are conceived and carried in the darkness,&rdquo; emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as &ldquo;a complicit kiss,&rdquo; during which &ldquo;the beam of the setting sun shines along the passage, and onto the tomb&rsquo;s back wall&rdquo; (12). When she goes inside the tomb, she expects &ldquo;not utter darkness, but perhaps a wombish red&rdquo;; however, this is denied her because of the lights of the surveyors, one of whom is &ldquo;folded, foetus-like, into the little cell in the back wall&rdquo;: a foetus implanted in the very place where the sunbeam strikes (12, 13). When Jamie leaves, she describes taking &ldquo;the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air&rdquo; (17). The tunnel that admits the beam has become a birth canal, so Jamie&rsquo;s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.</p>
+ <p>[15] Myra Schneider&rsquo;s &ldquo;Maeshowe&rdquo; also hints that to visit the tomb is to return to the womb when the speaker remarks that although &ldquo;you&rdquo; are part of a tour group, you will realize that you are &ldquo;alone&rdquo; and have &ldquo;never travelled so far back / so far in&rdquo; (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: &ldquo;In that deep chamber / you&rsquo;ll be bathed in red, not the red spilt in hatred&mdash;/the red that&rsquo;s birth, the heart looming with the blood&rdquo; (24-28). In the vision that the speaker evokes for the visitor&rsquo;s memory, therefore, the &ldquo;dark that swaddles seeds&rdquo; not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors&rsquo; lights, Schneider suggests that it is possible for us to experience the &ldquo;wombish red&rdquo; that was denied Jamie, blood that is the force of life rather than the mark of violence.</p>
+ <p>[16] Dilys Rose&rsquo;s poem, &ldquo;Maeshowe Nipple,&rdquo; on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb&rsquo;s external anatomy: a &ldquo;breast,&rdquo; with an &ldquo;aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers&rdquo; (1-3). Within, Maeshowe&rsquo;s chambers have been &ldquo;invaded&rdquo; by men who &ldquo;inscribed their conquests&rdquo; and &ldquo;totted up the loot&rdquo; (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.</p>
+ <p>[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote <u>Early in Orcadia</u> after a friend took her to another of Orkney&rsquo;s chambered tombs, Isbister, which has no passage entry, because &ldquo;she knew it would waken something in me&rdquo; (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a &ldquo;few moons&rdquo; have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, &ldquo;the big bad Moon Woman had once had an honouring place,&rdquo; had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was &ldquo;pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in&rdquo; (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman&rsquo;s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters &ldquo;on her hands and knees . . . 
under and between great stones.&rdquo; Once inside, though, she thinks of the moments before she conceived her child: &ldquo;She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back&rdquo; (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the &ldquo;geo&rdquo; (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is &ldquo;push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must&rdquo; (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.</p>
+ <p>[18] By the end of <u>Early in Orcadia</u>, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the &ldquo;Great Man,&rdquo; who is &ldquo;painted with sun circles,&rdquo; to Moon Woman after he has summoned her to his presence: &ldquo;The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker&rdquo; (169). Moon Woman, however, is aware of the injustice of this arrangement: &ldquo;They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it&rdquo; (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:</p><blockquote>If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon&rsquo;s. (176)
+ </blockquote>
+ <p>She is picturing an intertwining of sun and moon, of masculine and feminine&mdash;a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.</p>
+ <p>[19] Mitchison&rsquo;s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: &ldquo;A fundamental polarity in many creation myths,&rdquo; according to Trevor Garnham, &ldquo;contrasts the dark, fecund, harbouring earth with the up-drawing sun.&rdquo; (145). He points out, for example, that &ldquo;by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun&rdquo; and that in Norse mythology, &ldquo;a male hierarchy supplanted older, matriarchal law&rdquo; (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that &ldquo;ritual activity associated with burial cairns became transferred to stone circles&rdquo; (152).</p>
+ <p>[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a &ldquo;nurturing role&rdquo; (66). The houses at the coastal village site, Scara Brae, therefore, &ldquo;seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket&rdquo; (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Scara Brae, and both were covered with turf (Garnham 48; Challands, Muir &amp; Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.</p>
+ <p>[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, &ldquo;In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction&rdquo; (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Scara Brae that was probably &ldquo;highly restricted on the basis of an individual&rsquo;s status, probably additionally defined in terms of age and gender.&rdquo; However, they also warn that there is insufficient archaeological evidence to &ldquo;leap to conclusions about a patriarchal group of &lsquo;elders&rsquo; who used knowledge as a commodity to maintain their power over women and younger men&rdquo; (Muir &amp; Richards 204). Although cautious, they do acknowledge that &ldquo;power and authority,&rdquo; probably based on &ldquo;cosmological beliefs,&rdquo; would have been necessary to build the monuments (199). Leaning not only on physical but also anthropological evidence, Garnham&rsquo;s view, on the other hand, is that the more formal structure <u>does</u> support the idea of hierarchy and that the estimated 100,000 man/hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers &ldquo;can be read as a metaphor of primogeniture&rdquo; (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade&rsquo;s <u>Patterns in Comparative Religion</u>, he also emphasizes that there is &ldquo;a close connection between solar theology and the elite&rdquo; (163). In this context it seems that &ldquo;allowing access to the sun . . . 
was more important that [sic] allowing access to members of the tribe&rdquo; (131-132).</p>
+ <p>[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its &ldquo;point of maximum power,&rdquo; Maeshowe allows the sun inside only when it is &ldquo;at its lowest ebb.&rdquo; Except at midwinter, &ldquo;the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon&rdquo; (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of <u>Early in Orcadia</u>, Garnham&rsquo;s interpretation of Maeshowe and the stone circles parallels Mitchison&rsquo;s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine &ldquo;whiteness of the moon,&rdquo; and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.</p>
+ <p>[23] Although <u>Early in Orcadia</u> ends with Moon Woman&rsquo;s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, &ldquo;Should I, then?&rdquo; (176). She does not ask &ldquo;Can I?&rdquo; but &ldquo;Should I?&rdquo; Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose&rsquo;s poem, 12th century Vikings may take up residence inside, but 900 years later, the reader is instructed to &ldquo;See,&rdquo; to bear witness to &ldquo;a green breast in a green field,&rdquo; the most nurturing part of a woman&rsquo;s body surrounded by the new growth of spring (1). When Schneider refers to the &ldquo;red that&rsquo;s birth&rdquo; rather than the &ldquo;red spilt in hatred,&rdquo; and describes how the sun will &ldquo;burnish the passage wall, / flood the ground with gold&rdquo; and, similarly, when Jamie refers to the &ldquo;complicit kiss,&rdquo; it is as if Moon Woman&rsquo;s consummation has finally taken place and justice restored.</p>
+ <p>[24] Richards asks where the doors of tombs lead, to what &ldquo;revelation.&rdquo; Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe&rsquo;s entryway towards &ldquo;revelation.&rdquo; Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, &ldquo;looking for Scotland in Scotland&rsquo;s prehistory&rdquo; (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be &ldquo;treading on the toes of archaeologists,&rdquo; points out that their physical &ldquo;evidence may not always offer a clear interpretation, in fact it very seldom does&rdquo; (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.</p>
+ <p>[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, &ldquo;This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery&rdquo; (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that &ldquo;the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place&rdquo; (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that &ldquo;Insights can only come from interpretation&rdquo; (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.</p>
+ <p>[26] George Mackay Brown has written, &ldquo;People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC&rdquo; (&ldquo;Brodgar Poems&rdquo; lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.</p>
+ <p>ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice. </p>
+ <p align="center">WORKS CITED</p>
+ <p>Bevan, Archie, and Brian Murray. Eds. <u>The Collected Poems of George Mackay Brown</u>. London: John Murray, 2005. Print.</p>
+ <p>Brown, George Mackay. &ldquo;Brodgar Poems (1992).&rdquo; In Bevan and Murray. 308-312. Print.</p>
+ <p>---. &ldquo;Maeshowe: Midwinter.&rdquo; 1996. In Bevan and Murray. 320. Print.</p>
+ <p>---. &ldquo;Maeshowe at Midwinter.&rdquo; 1977. <u>Under Binkie&rsquo;s Brae</u>. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.</p>
+ <p>---. &ldquo;Two Maeshowe Poems.&rdquo; 1999. In Bevan and Murray. 420-421. Print.</p>
+ <p>Card, Nick, et al. &ldquo;Bringing a Landscape to Life? Researching and Managing &lsquo;The Heart of Neolithic Orkney&rsquo; World Heritage Site.&rdquo; <u>World Archaeology</u> 39.3 (2007): 417-435. EBSCO <u>Academic Search Complete</u>. Web. 29 Jun. 2011.</p>
+ <p>Challands, Adrian, Tom Muir, and Colin Richards. &ldquo;The Great Passage Grave of Maeshowe.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.</p>
+ <p>Crawford, Robert. &ldquo;Maes Howe Sappho.&rdquo; <u>Yale Review</u> 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.</p>
+ <p>Garnham, Trevor. <u>Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney</u>. Stroud, Gloucestershire: Tempus, 2004. Print.</p>
+ <p>Hall, Simon W. <u>The History of Orkney Literature</u>. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.</p>
+ <p>&ldquo;Heart of Neolithic Orkney WHS: Setting Project.&rdquo; Historic Scotland. 2008. EBSCO <u>Academic Search Complete</u>. Web. 30 Jun. 2011.</p>
+ <p>Jamie, Kathleen. &ldquo;Darkness and Light.&rdquo; <u>Findings: Essays on the Natural and Unnatural World</u>. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.</p>
+ <p>McKie, Robin. &ldquo;Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.&rdquo;</p>
+ <p><u>The Guardian / The Observer</u>. 6 Oct. 2012. Web. 16 Mar. 2013.</p>
+ <p>Mitchison, Naomi. <u>Early in Orcadia</u>. Glasgow: Richard Drew, 1987. Print.</p>
+ <p>Jones, Si&acirc;n, and Colin Richards. &ldquo;The Villagers of Barnhouse.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.</p>
+ <p>Richards, Colin. &ldquo;Doorways into Another World: The Orkney-Cromarty Chambered Tombs.&rdquo; <u>Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall</u>. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.</p>
+ <p>Riddoch, Lesley. &ldquo;Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.&rdquo; <u>The Scotsman</u>. 13 Feb. 2006. Web. 30 Jun. 2011.</p>
+ <p>Rose, Dilys. &ldquo;Maes Howe Nipple.&rdquo; <u>Bodywork</u>. Edinburgh: Luath Press, 2007. Print.</p>
+ <p>Schneider, Myra. &ldquo;Maeshowe.&rdquo; <u>Circling the Core</u>. London: Enitharmon Press, 2008. 23-24. Print.</p>
+ <p>Wordsworth, William. &ldquo;I wandered lonely as a cloud.&rdquo; <u>The Norton Anthology of English Literature</u>. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.</p>
+<p><strong>Contributor's Note</strong></p>
+ <p><strong>CHARLOTTE FAIRLIE</strong> teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.</p></td>
+ <td valign="top"><center>
+ <a href="../index.html"> <img src="../image/btncu.gif" alt="Current Issue" border="0" height="42" width="79"></a><br>
+ <a href="../download.html" tppabs="http://www.genders.org/download.html"> <img src="../image/btndo.gif" alt="Download" tppabs="http://www.genders.org/image/btndo.gif" align="bottom" border="0" height="42" width="115"></a><br>
+ <a href="../edit.html" tppabs="http://www.genders.org/edit.html"> <img src="../image/btned.gif" alt="Editorial Board" tppabs="http://www.genders.org/image/btned.gif" align="bottom" border="0" height="50" width="80"></a><br>
+ <a href="../guide.html" tppabs="http://www.genders.org/guide.html"> <img src="../image/btngu.gif" alt="Contributor Guidelines" tppabs="http://www.genders.org/image/btngu.gif" align="bottom" border="0" height="42" width="90"></a><br>
+ <a href="../recent.html"> <img src="../image/btnre.gif" alt="Recent Issues" tppabs="http://www.genders.org/image/btnre.gif" align="bottom" border="0" height="41" width="79"></a><br>
+ <a href="../link.html"> <img src="../image/btnli.gif" alt="Links &amp; Books" border="0" height="46" width="97"></a><br>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <table width="500">
+ <tbody>
+ <tr>
+ <td><p><a href="../download.html">Copyright</a> ©2010 Ann Kibbey.
+
+ All Rights Reserved Worldwide.<br>
+ </p>
+ <p> </p>
+ <center>
+ <a href="../download.html"><font size="1">Download</font></a><font size="1"> || <a href="../edit.html">Editorial Board</a> || <a href="../guide.html">Submission
+
+ Guidelines</a> || <a href="../index.html">Current Issue</a> || <a href="../recent.html">Recent Issues</a> || <a href="../link.html">Links
+
+ &amp; Books</a></font>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <p></p>
+ <p align="right">
+
+ <table width="550">
+ <tbody>
+ <tr>
+ <td width="361"></td>
+ <td width="72"><p><img src="../image/algosmlr.gif" alt="Genders" align="bottom" border="0" height="72" width="72"> </p></td>
+ <td width="101"><b> <font size="1">Genders Journal</font></b> <font size="1"><br>
+ 226 UCB<br>
+ University of Colorado<br>
+ Boulder, CO 80309<br>
+ http://www.Genders.org</font></td>
+ </tr>
+ </tbody>
+ </table>
+ </p>
+ <p align="right"></p></td>
+ </tr>
+ </tbody>
+</table>
+</p>
+<p></p>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html
new file mode 100644
index 0000000..177da83
--- /dev/null
+++ b/python/tests/files/nature_article.html
@@ -0,0 +1,1379 @@
+
+
+
+
+
+
+
+
+<!DOCTYPE html>
+<html lang="en" class="grade-c">
+<head>
+ <meta charset="utf-8">
+<link rel="dns-prefetch" href="//ajax.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.gstatic.com"/>
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
+
+ <title>More than 100 scientific journals have disappeared from the Internet</title>
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+ <meta property="og:type" content="article"/>
+ <meta property="og:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta property="og:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+ <meta name="twitter:card" content="summary_large_image"/>
+ <meta name="twitter:site" content="@nature"/>
+ <meta name="twitter:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta name="twitter:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta name="twitter:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+
+
+ <meta name="journal_id" content="41586"/>
+
+ <meta name="dc.title" content="More than 100 scientific journals have disappeared from the Internet"/>
+
+ <meta name="dc.source" content="Nature 2020"/>
+
+ <meta name="dc.format" content="text/html"/>
+
+ <meta name="dc.publisher" content="Nature Publishing Group"/>
+
+ <meta name="dc.date" content="2020-09-10"/>
+
+ <meta name="dc.type" content="News"/>
+
+ <meta name="dc.language" content="En"/>
+
+ <meta name="dc.copyright" content="2020 Nature"/>
+
+ <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="dc.description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="prism.publicationName" content="Nature"/>
+
+ <meta name="prism.publicationDate" content="2020-09-10"/>
+
+ <meta name="prism.section" content="News"/>
+
+ <meta name="prism.startingPage" content=""/>
+
+ <meta name="prism.endingPage" content=""/>
+
+ <meta name="prism.copyright" content="2020 Nature"/>
+
+ <meta name="prism.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="prism.url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+
+ <meta name="prism.doi" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="dc.identifier" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="DOI" content="10.1038/d41586-020-02610-z"/>
+
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="dc.creator" content="Diana Kwon"/>
+
+ <meta name="dc.subject" content="Publishing"/>
+
+
+
+<script>(function(e){var t=e.documentElement,n=e.implementation;t.className='js';if(n&&n.hasFeature('http://www.w3.org/TR/SVG11/feature#Image','1.1')){t.className+=' svg'}})(document)</script>
+<link rel="stylesheet" href="/static/css/mosaic-grade-c.26f07b2f11.css">
+
+<link rel="stylesheet" class="js-ctm" href="/static/css/magazine-mosaic-150.7f46c29843.css" media="only screen, print and (-webkit-min-device-pixel-ratio:0) and (min-color-index:0), (-ms-high-contrast: none), only all and (min--moz-device-pixel-ratio:0) and (min-resolution: 3e1dpcm)">
+
+
+ <style>
+ .c-header--brand-border {
+ border-bottom: 5px solid #000;
+ }
+ </style>
+
+<link rel="apple-touch-icon" sizes="180x180" href=/static/images/favicons/nature/apple-touch-icon.f39cb19454.png>
+<link rel="icon" type="image/png" sizes="32x32" href=/static/images/favicons/nature/favicon-32x32.3fe59ece92.png>
+<link rel="icon" type="image/png" sizes="16x16" href=/static/images/favicons/nature/favicon-16x16.951651ab72.png>
+<link rel="manifest" href=/static/manifest.1a481c42b1.json>
+<link rel="mask-icon" href=/static/images/favicons/nature/safari-pinned-tab.69bff48fe6.svg color="#000000">
+<link rel="shortcut icon" href=/static/images/favicons/nature/favicon.62367f778b.ico>
+<meta name="msapplication-TileColor" content="#000000">
+<meta name="msapplication-config" content=/static/browserconfig.e35b3b052c.xml>
+<meta name="theme-color" content="#000000">
+<meta name="application-name" content="Nature">
+
+<link rel="search" href="http://www.nature.com/search">
+<link rel="search" href="http://www.nature.com/opensearch/opensearch.xml" type="application/opensearchdescription+xml" title="nature.com">
+<link rel="search" href="http://www.nature.com/opensearch/request" type="application/sru+xml" title="nature.com">
+
+ <meta name="WT.cg_s" content="News"/>
+ <meta name="WT.z_cg_type" content="News"/>
+ <meta name="WT.page_categorisation" content="Article page"/>
+ <meta name="WT.z_subject_term" content="Publishing"/>
+
+<meta name="WT.template" content="oscar"/>
+<meta name="WT.cg_n" content="Nature"/>
+<meta name="dc.rights" content="©2020 Macmillan Publishers Limited. All Rights Reserved."/>
+<meta name="WT.z_bandiera_abtest" content="a"/>
+
+ <script data-test="dataLayer">
+ dataLayer = [{"content":{"category":{"contentType":"news","legacy":{"webtrendsPrimaryArticleType":"news","webtrendsSubjectTerms":"publishing","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Nature","webtrendsContentGroupType":null,"webtrendsContentSubGroup":"News"}},"article":{"doi":"10.1038/d41586-020-02610-z"},"attributes":{"cms":"core media","deliveryPlatform":"oscar","copyright":{"open":false,"legacy":{"webtrendsLicenceType":null}}},"contentInfo":{"authors":["Diana Kwon"],"publishedAt":1599696000,"publishedAtString":"2020-09-10","title":"More than 100 scientific journals have disappeared from the Internet","legacy":null,"publishedAtTime":null,"documentType":"aplusplus"},"journal":{"pcode":"nature","title":"nature","volume":null,"issue":null},"authorization":{"status":true},"features":[{"name":"furtherReadingSection","present":false}],"collection":null},"page":{"category":{"pageType":"article"},"attributes":{"template":"magazine mosaic","featureFlags":[{"name":"ab_test_news_feature","active":false}]},"search":null},"privacy":{},"version":"1.0.0","product":null,"session":null,"user":null,"backHalfContent":false}];
+</script>
+
+<script>
+ (function() {
+ function deleteCookie (name, domain) {
+ document.cookie = encodeURIComponent(name) +
+ '=' +
+ ';path=/' +
+ ';domain=' + domain +
+ ';expires=Thu, 01 Jan 1970 00:00:00 GMT';
+ }
+
+ var consentCookieParts = ('; ' + document.cookie).split('; OptanonConsent=');
+
+ if (consentCookieParts.length > 1) {
+ consentCookieParts.shift(); // remove redundant first part from the split array
+
+ // onetrust can set the same cookie multiple times with different domain specificities
+ for (let i=0; i<consentCookieParts.length; i++) {
+ var otCookieGroups = consentCookieParts[i].split('&groups=').pop().split('&').shift();
+
+ if (otCookieGroups.indexOf('C0001') === -1) {
+ deleteCookie('OptanonConsent', 'nature.com');
+ deleteCookie('OptanonAlertBoxClosed', 'nature.com');
+ }
+ }
+ }
+ })();
+</script>
+
+<script>
+ (function(w,d,t) {
+ function cc() {
+ var h = w.location.hostname;
+ if (h.indexOf('preview-www.nature.com') > -1) return;
+
+ var e = d.createElement(t),
+ s = d.getElementsByTagName(t)[0];
+
+ if (h.indexOf('nature.com') > -1) {
+ e.src = 'https://cdn.cookielaw.org/scripttemplates/otSDKStub.js';
+ e.setAttribute('data-domain-script', '83f2c78a-6cbc-4d1a-9088-3f8e8c4c7460');
+ } else {
+ e.src = '/static/js/cookie-consent-bundle.9d49adbc02.js';
+ e.setAttribute('data-consent', h);
+ }
+ s.parentNode.insertBefore(e, s);
+ }
+
+ !!w.google_tag_manager ? cc() : window.addEventListener('gtm_loaded', function() {cc()});
+ })(window,document,'script');
+</script>
+<script>
+ function OptanonWrapper() {
+ window.dataLayer.push({event:'OneTrustGroupsUpdated'});
+ document.activeElement.blur();
+ }
+</script>
+
+
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;
+
+
+ j.addEventListener('load', function() {
+ var _ge = new CustomEvent('gtm_loaded', { bubbles: true });
+ d.dispatchEvent(_ge);
+ });
+
+ f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-NWDMT9Q');</script>
+
+
+
+</head>
+<body>
+
+
+
+<div role="banner" class="position-relative cleared z-index-50 background-white" data-test="top-containers">
+
+
+ <a class="c-skip-link u-hide-print" href="#content">Skip to main content</a>
+
+
+
+
+
+
+
+ <aside class="c-ad c-ad--728x90">
+ <div class="c-ad__inner" data-container-type="banner-advert">
+ <p class="c-ad__label">Advertisement</p>
+
+
+
+ <div id="article-doubleclickad-container">
+ <div id="div-gpt-ad-top-1"
+ class="div-gpt-ad advert leaderboard js-ad text-center hide-print grade-c-hide"
+ data-ad-type="top"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="728x90"
+ data-gpt-targeting="type=article;pos=top;artid=d41586-020-02610-z;doi=10.1038/d41586-020-02610-z;subjmeta=479,648,706;kwrd=Publishing">
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing"
+ alt="Advertisement"
+ width="728"
+ height="90"></a>
+ </noscript>
+ </div>
+</div>
+
+
+
+
+ </div>
+ </aside>
+
+
+
+
+
+ <div class="c-grade-c-banner u-hide">
+ <div class="c-grade-c-banner__container">
+
+ <p>Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+ the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+ Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+ and JavaScript.</p>
+
+ </div>
+ </div>
+
+
+
+
+ <header class="c-header c-header--brand-border" id="header" data-header>
+ <div class="c-header__row-border">
+ <div class="c-header__container">
+ <div class="c-header__layout">
+ <a href="/nature"
+ data-track="click" data-track-action="home" data-track-category="nature-150-split-header" data-track-label="image">
+ <picture class="c-header__logo">
+ <source srcset="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" media="(min-width: 769px)">
+ <img src="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" alt="Nature">
+ </picture>
+ </a>
+ <div class="c-header__layout">
+
+ <div class="c-header__site-navigation c-header__site-navigation--show-at-md"
+ data-test="siteindex-link">
+ <a class="c-header__link" href="https://www.nature.com/siteindex"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open nature research index" data-track-label="link">
+ <span>View all Nature Research journals</span>
+ </a>
+ </div>
+
+ <div class="c-header__site-navigation c-header__site-navigation--border">
+ <a class="c-header__link"
+ href="#search-menu"
+ data-header-expander
+ data-test="search-link" data-track="click" data-track-category="nature-150-split-header" data-track-action="open search tray" data-track-label="button">
+ <span>Search</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M16.48 15.455c.283.282.29.749.007 1.032a.738.738 0 01-1.032-.007l-3.045-3.044a7 7 0 111.026-1.026zM8 14A6 6 0 108 2a6 6 0 000 12z"/></svg>
+ </a>
+ <a href="/nams/svc/myaccount"
+ id="my-account"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="my account" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>My Account</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+<a href="https://idp.nature.com/authorize/natureuser?client_id&#x3D;grover&amp;redirect_uri&#x3D;https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z"
+ id="login-button"
+ style="display: none;"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="login" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>Login</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="c-header__container" data-test="c-header__container">
+ <ul class="c-header__menu">
+
+ <li class="c-header__item" data-test="explore-content-button">
+ <a href="#explore"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open explore expander" data-track-label="button">
+ <span>Explore <span class="c-header__show-text">our content</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item">
+ <a href="#journal-info"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open journal information expander" data-track-label="button">
+ <span>Journal info<span class="c-header__show-text">rmation</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item c-header__item--pipe">
+ <a class="c-header__link"
+ href="https://www.nature.com/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-category="nature-150-split-header"
+ data-track-label="link">
+ <span>Subscribe</span>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+
+ </header>
+
+
+
+
+ <div class="u-mb-16">
+ <div class="u-container">
+ <ol class="c-breadcrumbs">
+ <li class="c-breadcrumbs__item" id="breadcrumb0"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb1"><a class="c-breadcrumbs__link"
+ href="/"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:nature"><span itemprop="title">nature</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb1"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb2"><a class="c-breadcrumbs__link"
+ href="/nature/articles?type&#x3D;news"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:news"><span itemprop="title">news</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb2"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb3"><span itemprop="title">article</span></li>
+ </ol>
+ </div>
+ </div>
+
+
+
+
+
+
+</div>
+
+
+ <div id="content" class="article-page position-relative z-index-1">
+ <section class="container highlight-container article-page--news container-with-gap">
+ <article class="article-item article-item--open" itemscope="" itemtype="http://schema.org/NewsArticle"
+ data-track-component="news">
+ <div class="container cleared container-type-article" data-container-type="article" itemprop="articleBody">
+ <div class="content position-relative cleared clear mq1200-padded" data-component="article-container"
+ role="main">
+ <header class="article-item__header clear cleared pull--both">
+ <div class="article__type">NEWS
+ <div class="ml10 article__date">
+ <time itemprop="datePublished">10 September 2020</time>
+ </div>
+ </div>
+
+ <div class="clear cleared"></div>
+ <h1 class="article-item__title serif" itemprop="headline">More than 100 scientific journals have disappeared from the Internet</h1>
+
+ <div class="article-item__teaser-text serif">
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.
+ </div>
+ </header>
+
+ <div class="clear cleared"></div>
+
+ <div class="bordered-container clear cleared pull--both">
+ <div id="author-affiliations" class="tab-group text14" role="tablist" data-test="author-affiliations" data-tab-group>
+ <div class="cleared">
+
+ <div id="author-affiliation-news-0" class="tab-box js-box-wrapper">
+ <h3 id="author-affiliation-news-0-head" data-track="click" data-track-label="view author info" class="sans-serif strong tab tab-skin ma0" role="tab"
+ aria-controls="author-affiliation-news-0-content" data-tooltip="Show author information">
+ Diana Kwon
+ </h3>
+ <div id="author-affiliation-news-0-content" class="tab-content pin-right grid grid-12 last"
+ role="tabpanel">
+ <div class="pa10" aria-labelledby="author-affiliation-news-0-head">
+ <div class="clear cleared">
+
+
+ <div class="align-left">
+ <h4 class="sans-serif">Search for this author in:</h4>
+ <ul class="ma0 clean-list">
+ <li class="strong"><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd&#x3D;search&amp;term&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Pub Med" >Pub Med</a></li>
+
+ <li class="strong"><a href="https://www.nature.com/search?order&#x3D;date_desc&amp;q&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Nature.com" >Nature.com</a></li>
+
+ <li class="strong"><a href="https://scholar.google.co.uk/scholar?as_q&#x3D;&amp;btnG&#x3D;Search+Scholar&amp;as_sauthors&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Google Scholar" >Google Scholar</a></li>
+ </ul>
+ </div>
+
+
+
+ </div>
+ </div>
+ </div>
+ </div>
+
+ </div>
+</div>
+
+ </div>
+
+ <div class="clear cleared pull--both">
+ <ul class="social clean-list inline-list hide-print">
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="twitter" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="https://twitter.com/intent/tweet?text=More+than+100+scientific+journals+have+disappeared+from+the+Internet&url=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Twitter</title>
+ <desc>Share on Twitter</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M20.8125,11.4875 C21.42,11.10375 21.8875,10.49625 22.105,9.7725 C21.5375,10.1275 20.90875,10.385 20.23875,10.5225 C19.70625,9.9225 18.9425,9.545 18.0975,9.545 C16.475,9.545 15.16,10.9325 15.16,12.6425 C15.16,12.885 15.185,13.1225 15.235,13.3475 C12.7975,13.2175 10.63125,11.985 9.1825,10.11 C8.93,10.56875 8.785,11.10125 8.785,11.66875 C8.785,12.74375 9.30375,13.69125 10.09125,14.2475 C9.61125,14.23125 9.1575,14.09 8.76125,13.86 L8.76125,13.8975 C8.76125,15.3975 9.77375,16.65125 11.11875,16.935 C10.87125,17.0075 10.6125,17.04375 10.34375,17.04375 C10.15625,17.04375 9.96875,17.025 9.79125,16.98875 C10.16625,18.22125 11.24875,19.11875 12.535,19.1425 C11.52875,19.97375 10.2625,20.4675 8.885,20.4675 C8.6475,20.4675 8.415,20.455 8.185,20.42625 C9.485,21.30375 11.02875,21.81625 12.6875,21.81625 C18.09,21.81625 21.04375,17.095 21.04375,13.00125 L21.03625,12.60125 C21.61125,12.16375 22.11125,11.6175 22.50125,10.99625 C21.97375,11.2425 21.4075,11.40875 20.81375,11.48375 L20.8125,11.4875 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="facebook" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Facebook</title>
+ <desc>Share on Facebook</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15.89625,22.8625 L12.57125,22.8625 L12.57125,15.02125 L10.90875,15.02125 L10.90875,12.31875 L12.57125,12.31875 L12.57125,10.69625 C12.57125,8.4925 13.50875,7.18 16.175,7.18 L18.39375,7.18 L18.39375,9.8825 L17.00625,9.8825 C15.96875,9.8825 15.9,10.26 15.9,10.965 L15.895,12.3175 L18.4075,12.3175 L18.115,15.02 L15.89625,15.02 L15.89625,22.8625 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="email" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="mailto:?subject=More than 100 scientific journals have disappeared from the Internet&body=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share via E-Mail</title>
+ <desc>Share via E-Mail</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15,15.3269887 L10.6248577,11.9177869 C10.4236021,11.7609644 10.1299323,11.7927468 9.96892789,11.988775 C9.80792343,12.1848031 9.84055341,12.4708451 10.041809,12.6276676 L14.7012493,16.2584003 C14.8680779,16.3940555 15.1152493,16.4013884 15.2915244,16.2640313 C15.2939898,16.2622325 15.2963784,16.2603294 15.2987507,16.2584003 L19.958191,12.6276676 C20.1594466,12.4708451 20.1920766,12.1848031 20.0310721,11.988775 C19.8700677,11.7927468 19.5763979,11.7609644 19.3751423,11.9177869 L15,15.3269887 Z M9,10 L21,10 C21.5522847,10 22,10.4477153 22,11 L22,19 C22,19.5522847 21.5522847,20 21,20 L9,20 C8.44771525,20 8,19.5522847 8,19 L8,11 C8,10.4477153 8.44771525,10 9,10 Z"></path>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+</ul>
+
+ </div>
+
+
+
+
+ <div class="align-left">
+
+ <div class="article__body serif cleared">
+ <p>Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August<sup><a href="#ref-CR1" data-track="click" data-action="anchor-link" data-track-label="go to reference" data-track-category="references">1</a></sup>.</p><p>“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"><h1 class="recommended__title serif">Investigating journals: The dark side of publishing</h1></a>
+ </aside></p><p>More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.</p><p>The study lays out a "compelling case" for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.</p><h2>Vanishing journals</h2><p>Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.</p><p>Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: Some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"><h1 class="recommended__title serif">Radical open-access plan could spell end to journal subscriptions</h1></a>
+ </aside></p><p>Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.</p><p>To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled into digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).</p><p>The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.</p><h2>Preserving the literature</h2><p>Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to have used a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. 
“It’s really hard to pin down when something doesn't absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-019-02038-0" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16870448.jpg"><h1 class="recommended__title serif">India culls hundreds more ‘dubious’ journals from government approved list</h1></a>
+ </aside></p><p>Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren't captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.</p><p>Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.</p>
+ </div>
+
+ <div class="emphasis">doi: <a href="https://doi.org/10.1038/d41586-020-02610-z">https://doi.org/10.1038/d41586-020-02610-z</a></div>
+ <div class="anchor-link mt40" data-toggle="anchor-links"></div>
+ <div id="references" class="references" data-toggle="anchor-links-section" data-label="References" data-concertina="true">
+ <section aria-labelledby="Bib1"><div class="serif article-section js-article-section cleared clear" id="Bib1-section"><h2 class="js-section-title section-title strong position-relative tighten-line-height background-gray-light pt20 pb6 pl0 pr20 standard-space-below small-space-above mq640-pt10 mq640-pb10 mq640-pl20 mq640-mt0 mq640-ml-20 mq640-mr-20 extend-left" id="Bib1">References</h2><div class="pl20 mq875-pl0 js-collapsible-section" id="Bib1-content"><div data-container-section="references"><ol class="clean-list ma0 standard-space-below indented-list" data-test="references-list"><li class="small-space-below border-gray-medium border-bottom-1 position-relative js-ref-item" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Article" data-test="citation"><span class="indented-counter serif h2 tighten-line-height text-right position-absolute grade-c-hide">1.</span><p class="tiny-space-below" id="ref-CR1">Laakso, M., Matthias, L. &amp; Jahn, N. Preprint at <a href="https://arxiv.org/abs/2008.11933">https://arxiv.org/abs/2008.11933</a> (2020).</p><ul class="js-ref-links clean-list cleared strong sans-serif text13 hide-print small-space-below"><li class="pin-right"><ul class="clean-list ma0"></ul></li></ul></li></ol><p class="hide-print text-right"><a href="/articles/d41586-020-02610-z-references.ris" class="text14 sans-serif strong" data-track="click" data-track-action="download citation references" data-track-label="link">Download references</a></p></div></div></div></section>
+ </div>
+
+
+
+
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="inPage box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-inPage-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-inPage">
+ <input id="briefing-box-signup-form-inPage-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-inPage-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-inPage-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-inPage-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-inPage-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+
+
+ </div>
+
+ <aside class="article__aside align-right">
+ <div class="related-content shrink--aside hide-print">
+
+ <h3 class="aside__title sans-serif">Related Articles</h3>
+ <ul class="ma0 clean-list">
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click"
+ data-track-label="related article (rank:0)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ </noscript>
+
+ Radical open-access plan could spell end to journal subscriptions
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click"
+ data-track-label="related article (rank:1)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ </noscript>
+
+ Investigating journals: The dark side of publishing
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-020-01066-5" data-track="click"
+ data-track-label="related article (rank:2)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ </noscript>
+
+ Nature to join open-access Plan S, publisher says
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07557-w" data-track="click"
+ data-track-label="related article (rank:3)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ </noscript>
+
+ Funders flesh out details of Europe’s bold open-access plan
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07245-9" data-track="click"
+ data-track-label="related article (rank:4)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ </noscript>
+
+ AI peer reviewers unleashed to ease publishing grind
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/open-access-the-true-cost-of-science-publishing-1.12676" data-track="click"
+ data-track-label="related article (rank:5)">
+
+ The true cost of science publishing
+ </a>
+ </h3>
+ </li>
+
+ </ul>
+ </div>
+
+ <div class="article__subjects bordered-container shrink--aside hide-print">
+ <h3 class="aside__title sans-serif">Subjects</h3>
+ <ul class="ma0 subject-list cleared clean-list inline-list">
+
+ <li class="subject"><a href="/subjects/publishing" data-track="click"
+ data-track-label="subject (rank:0)">Publishing</a>
+ </li>
+
+ </ul>
+ </div>
+
+
+
+<div id="div-gpt-ad-right-2"
+ class="div-gpt-ad medium-rectangle advert js-ad text-center hide-print grade-c-hide"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="300x250"
+ data-gpt-targeting="pos=right;artid=/articles/d41586-020-02610-z;path=/articles/d41586-020-02610-z"
+ data-ad-type="right"
+ >
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z"
+ alt="Advertisement"
+ width="300"
+ height="250"/>
+ </a>
+ </noscript>
+</div>
+
+
+ <div class="nature-briefing--sidebar bordered-container shrink--aside hide-print">
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="sidebar box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Sign up to Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-sidebar-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-sidebar">
+ <input id="briefing-box-signup-form-sidebar-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-sidebar-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-sidebar-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-sidebar-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-sidebar-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+</div>
+
+ </aside>
+ </div>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">
+ <meta content="Macmillan Publishers Limited, part of Springer Nature" itemprop="name"/>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="author" itemtype="https://schema.org/Organization">
+ <meta content="Nature Editorial" itemprop="name"/>
+ </div>
+ <img src="/platform/track/article/d41586-020-02610-z" width="1" height="1" alt="" class="visually-hidden"/>
+</article>
+
+
+
+
+
+
+
+<div class="c-site-messages message hide u-hide-print c-site-messages--nature-briefing c-site-messages--nature-briefing-email-variant c-site-messages--nature-briefing-redesign-2020 sans-serif"
+data-component-id="nature-briefing-banner"
+data-component-expirydays="30"
+data-component-trigger-scroll-percentage="15"
+data-track="in-view"
+data-track-action="in-view"
+data-track-category="nature briefing"
+data-track-label="redesign banner visible">
+
+
+ <div class="c-site-messages__banner-large">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__form-container">
+
+
+
+ <div class="grid grid-12 last">
+ <div class="grid grid-4">
+ <img alt="Nature Briefing" src="/static/images/logos/nature-briefing-logo-n150-white.d81c9da3ec.svg" width="250" height="40">
+ <p class="c-site-messages--nature-briefing__strapline extra-tight-line-height">Sign up for the <em>Nature Briefing</em> newsletter — what matters in science, free to your inbox daily.</p>
+ </div>
+ <div class="grid grid-8 last">
+ <form action="/briefing/signup/formfeedback" method="post" data-location="banner" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-banner-signup-form-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBannerRedesign2020">
+ <input id="briefing-banner-signup-form-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBanner">
+ <label class="nature-briefing-banner__email-label" for="banner-EmailAddressInput">Email address</label>
+
+ <div class="nature-briefing-banner__email-wrapper">
+ <input class="nature-briefing-banner__email-input box-sizing text14" type="email" id="banner-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-emailbanner-email-input">
+ <button type="submit" class="nature-briefing-banner__submit-button box-sizing text14" data-test-element="briefing-emailbanner-signup-button">Sign up</button>
+ </div>
+
+ <div class="nature-briefing-banner__checkbox-wrapper grid grid-12 last">
+ <input class="nature-briefing-banner__checkbox-checkbox" id="gdpr-briefing-banner-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-emailbanner-gdpr-checkbox" required>
+ <label class="nature-briefing-banner__checkbox-label box-sizing text13 sans-serif block tighten-line-height" for="gdpr-briefing-banner-checkbox">I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+ </form>
+ </div>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+ <div class="c-site-messages__banner-small">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__content text14">
+ <span class="c-site-messages--nature-briefing__strapline strong serif">Get the most important science stories of the day, free in your inbox.</span>
+ <a class="nature-briefing__link text14 sans-serif"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner CTA to site"
+ data-test-element="briefing-banner-link"
+ target="_blank"
+ rel="noreferrer noopener"
+ href="/briefing/signup/?origin=Nature&amp;originReferralPoint=EmailBanner">Sign up for Nature Briefing
+ </a>
+ </div>
+
+ </div>
+
+</div>
+
+ </section>
+</div>
+ <script>
+ window.onload = function () {
+ Array.prototype.slice.call(document.querySelectorAll(".magazine-infographic > iframe"))
+ .forEach(function (element) {
+ function listener(event) {
+ if (event.data.height) {
+ if (element.id === event.data.requestData.id) {
+ element.setAttribute("height", event.data.height)
+ }
+ }
+ }
+
+ window.addEventListener("message", listener);
+ element.contentWindow.postMessage({name: "getHeight", id: element.id}, "*");
+ });
+ }
+ </script>
+ <script>
+ var linkEl = document.querySelector('.js-ctm');
+ if (linkEl && window.matchMedia && window.matchMedia(linkEl.media).matches) {
+ var fragment = document.createDocumentFragment();
+ var polyfillScript = document.createElement('script');
+ var header150Script = null;
+ var appScript = document.createElement('script');
+ var sharedEs6Script = document.createElement('script');
+
+ polyfillScript.src = 'https://cdn.polyfill.io/v2/polyfill.min.js?features=default,IntersectionObserver,Array.prototype.includes,Promise';
+ polyfillScript.async = false;
+ fragment.appendChild(polyfillScript);
+
+ appScript.src = '/static/js/magazine/magazine-mosaic.71d8740808.js';
+ appScript.async = false;
+ fragment.appendChild(appScript);
+
+ sharedEs6Script.src = '/static/js/shared-es6-bundle.c83ed51f05.js';
+ sharedEs6Script.async = false;
+ fragment.appendChild(sharedEs6Script);
+
+ header150Script = document.createElement('script');
+ header150Script.src = '/static/js/header-150-bundle.aaea96385f.js';
+ header150Script.async = false;
+ fragment.appendChild(header150Script);
+
+ document.body.appendChild(fragment);
+ }
+ </script>
+ <script>
+ var idp = {
+ hasNatureUserProof: function (hasProof) {
+ if (!hasProof) {
+ document.getElementById("my-account").setAttribute("style", "display: none;");
+ document.getElementById("login-button").setAttribute("style", "");
+ }
+ }
+ }
+ </script>
+ <script src="https://verify.nature.com/verify/nature.min.js"></script>
+ <noscript>
+ <img src="https://verify.nature.com/verify/nature.png" alt="" width="0" height="0"/>
+ </noscript>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Explore-our-content" data-test="Explore-our-content" id="explore" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Explore-our-content" class="c-header-expander__heading u-js-hide">Explore our content</h2>
+ <ul class="c-header-expander__list">
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/research"
+ data-track="click"
+ data-track-action="research"
+ data-track-label="link">
+ Research
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/news"
+ data-track="click"
+ data-track-action="news"
+ data-track-label="link">
+ News
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/opinion"
+ data-track="click"
+ data-track-action="opinion"
+ data-track-label="link">
+ Opinion
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/research-analysis"
+ data-track="click"
+ data-track-action="research analysis"
+ data-track-label="link">
+ Research Analysis
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/careers"
+ data-track="click"
+ data-track-action="careers"
+ data-track-label="link">
+ Careers
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/books-culture"
+ data-track="click"
+ data-track-action="books and culture"
+ data-track-label="link">
+ Books and Culture
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/podcast"
+ data-track="click"
+ data-track-action="podcasts"
+ data-track-label="link">
+ Podcasts
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/videoarchive"
+ data-track="click"
+ data-track-action="videos"
+ data-track-label="link">
+ Videos
+ </a>
+ </li>
+
+
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/current-issue"
+ data-track="click"
+ data-track-action="current issue"
+ data-track-label="link">
+ Current Issue
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-issues"
+ data-track="click"
+ data-track-action="browse issues"
+ data-track-label="link">
+ Browse Issues
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/articles"
+ data-track="click"
+ data-track-action="browse articles"
+ data-track-label="link">
+ Browse Articles
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/collections"
+ data-track="click"
+ data-track-action="browse collections"
+ data-track-label="link">
+ Browse Collections
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-subjects"
+ data-track="click"
+ data-track-action="browse subjects"
+ data-track-label="link">
+ Browse Subjects
+ </a>
+ </li>
+
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="https://www.nature.com/my-account/alerts/subscribe-journal?list-id&#x3D;1"
+ data-track="click"
+ data-track-action="Sign up for alerts"
+ data-track-label="link">Sign up for alerts<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m4 10h2.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-3.08578644l-1.12132034 1.1213203c-.18753638.1875364-.29289322.4418903-.29289322.7071068v.1715729h14v-.1715729c0-.2652165-.1053568-.5195704-.2928932-.7071068l-1.7071068-1.7071067v-3.4142136c0-2.76142375-2.2385763-5-5-5-2.76142375 0-5 2.23857625-5 5zm3 4c0 1.1045695.8954305 2 2 2s2-.8954305 2-2zm-5 0c-.55228475 0-1-.4477153-1-1v-.1715729c0-.530433.21071368-1.0391408.58578644-1.4142135l1.41421356-1.4142136v-3c0-3.3137085 2.6862915-6 6-6s6 2.6862915 6 6v3l1.4142136 1.4142136c.3750727.3750727.5857864.8837805.5857864 1.4142135v.1715729c0 .5522847-.4477153 1-1 1h-4c0 1.6568542-1.3431458 3-3 3-1.65685425 0-3-1.3431458-3-3z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Journal-information" id="journal-info" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Journal-information" class="c-header-expander__heading u-js-hide">Journal information</h2>
+ <ul class="c-header-expander__list">
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/about"
+ data-track="click"
+ data-track-action="about the journal"
+ data-track-label="link">
+ About the Journal
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-authors"
+ data-track="click"
+ data-track-action="for authors"
+ data-track-label="link">
+ For Authors
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-referees"
+ data-track="click"
+ data-track-action="for referees"
+ data-track-label="link">
+ For Referees
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/awards"
+ data-track="click"
+ data-track-action="awards"
+ data-track-label="link">
+ Awards
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-label="link">
+ Subscribe
+ </a>
+ </li>
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="http://mts-nature.nature.com/"
+ data-track="click"
+ data-track-action="Submit manuscript"
+ data-track-label="link">Submit manuscript<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m15 0c1.1045695 0 2 .8954305 2 2v5.5c0 .27614237-.2238576.5-.5.5s-.5-.22385763-.5-.5v-5.5c0-.51283584-.3860402-.93550716-.8833789-.99327227l-.1166211-.00672773h-9v3c0 1.1045695-.8954305 2-2 2h-3v10c0 .5128358.38604019.9355072.88337887.9932723l.11662113.0067277h7.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-7.5c-1.1045695 0-2-.8954305-2-2v-10.17157288c0-.53043297.21071368-1.0391408.58578644-1.41421356l3.82842712-3.82842712c.37507276-.37507276.88378059-.58578644 1.41421356-.58578644zm-.5442863 8.18867991 3.3545404 3.35454039c.2508994.2508994.2538696.6596433.0035959.909917-.2429543.2429542-.6561449.2462671-.9065387-.0089489l-2.2609825-2.3045251.0010427 7.2231989c0 .3569916-.2898381.6371378-.6473715.6371378-.3470771 0-.6473715-.2852563-.6473715-.6371378l-.0010428-7.2231995-2.2611222 2.3046654c-.2531661.2580415-.6562868.2592444-.9065605.0089707-.24295423-.2429542-.24865597-.6576651.0036132-.9099343l3.3546673-3.35466731c.2509089-.25090888.6612706-.25227691.9135302-.00001728zm-.9557137-3.18867991c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5zm-8.5-3.587-3.587 3.587h2.587c.55228475 0 1-.44771525 1-1zm8.5 1.587c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+
+
+ <div id="search-menu" class="c-header-expander c-header-expander--tray u-hide-print" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <h2 class="u-visually-hidden">Search</h2>
+ <div data-test="inline-search">
+ <div class="c-header-expander__keyline u-mb-16">
+ <form action="/search"
+ method="get"
+ role="search"
+ class="c-header-expander__form"
+ autocomplete="off"
+ data-dynamic-track-label
+ data-track="submit" data-track-action="search" data-track-label="form">
+ <label class="c-header-expander__heading" for="keywords">Article Search</label>
+ <div class="c-form-field u-display-flex">
+ <input type="text"
+ class="c-form-field__input u-flex-shrink"
+ id="keywords"
+ name="q"
+ value=""
+ placeholder="Search by keywords or author"
+ data-test="search-keywords">
+ <button type="submit" class="c-button c-button--contrast u-flex-static u-ml-8" data-test="search-submit">Search</button>
+ </div>
+ <p class="u-ma-0">
+ <a href="/search/advanced"
+ data-track="click" data-track-action="advanced search" data-track-label="link">
+ Advanced search
+ </a>
+ </p>
+ </form>
+ </div>
+ <div class="c-header-expander__keyline">
+ <h3 class="c-header-expander__heading">Quick links</h3>
+ <ul class="u-list-reset">
+ <li class="u-display-inline-block u-mr-24"><a href="/subjects" data-track="click" data-track-action="explore articles by subject" data-track-label="link">Explore articles by subject</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/naturecareers" data-track="click" data-track-action="find a job" data-track-label="link">Find a job</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to authors</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+
+
+
+
+<footer role="contentinfo" class="composite-layer">
+ <div class="u-mt-16 u-mb-16">
+ <div class="u-container">
+ <div class="u-display-flex u-flex-wrap u-justify-content-space-between">
+ <p class="c-meta u-ma-0 u-mr-24">
+
+</p>
+
+ <p class="c-meta u-ma-0">
+ <span aria-level="2" class="c-meta__item" itemprop="name">
+ Nature
+ </span>
+ <span class="c-meta__item">
+ <abbr title="International Standard Serial Number">ISSN</abbr> <span itemprop="issn">1476-4687</span> (online)
+ </span>
+ </p>
+ </div>
+ </div>
+</div>
+
+
+ <div itemscope itemtype="http://schema.org/Periodical">
+ <meta itemprop="publisher" content="Springer Nature">
+ <div class="c-footer">
+ <div class="u-container">
+ <div class="u-hide-print" data-track-component="footer">
+ <h2 aria-level="2" class="u-visually-hidden">nature.com sitemap</h2>
+ <div class="c-footer__header">
+ <div class="c-footer__logo">
+ <img alt="Nature Research" src="/static/images/logos/nature research-white-150.f4acf77e0c.svg" loading="lazy" width="200" height="26">
+ </div>
+ <ul class="c-menu c-menu--inherit u-mr-32">
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/company_info/index.html" data-track="click" data-track-action="about us" data-track-label="link">About us</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/press_room/press_releases.html" data-track="click" data-track-action="press releases" data-track-label="link">Press releases</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://press.nature.com/" data-track="click" data-track-action="press office" data-track-label="link">Press office</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://support.nature.com/support/home" data-track="click" data-track-action="contact us" data-track-label="link">Contact us</a></li>
+ </ul>
+ <ul class="c-menu c-menu--inherit">
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.facebook.com/nature/" aria-label="Nature on Facebook" data-track="click" data-track-action="facebook" data-track-label="link">
+ <svg class="u-icon u-mt-2 u-mb-2" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 20 20"><path d="M2.5 20C1.1 20 0 18.9 0 17.5v-15C0 1.1 1.1 0 2.5 0h15C18.9 0 20 1.1 20 2.5v15c0 1.4-1.1 2.5-2.5 2.5h-3.7v-7.7h2.6l.4-3h-3v-2c0-.9.2-1.5 1.5-1.5h1.6V3.1c-.3 0-1.2-.1-2.3-.1-2.3 0-3.9 1.4-3.9 4v2.2H8.1v3h2.6V20H2.5z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://twitter.com/nresearchnews?lang=en" aria-label="Nature on Twitter" data-track="click" data-track-action="twitter" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M17.6 4.1c.8-.5 1.5-1.4 1.8-2.4-.8.5-1.7.9-2.6 1-.7-.8-1.8-1.4-3-1.4-2.3 0-4.1 1.9-4.1 4.3 0 .3 0 .7.1 1-3.4 0-6.4-1.8-8.4-4.4C1 2.9.8 3.6.8 4.4c0 1.5.7 2.8 1.8 3.6C2 8 1.4 7.8.8 7.5v.1c0 2.1 1.4 3.8 3.3 4.2-.3.1-.7.2-1.1.2-.3 0-.5 0-.8-.1.5 1.7 2 3 3.8 3-1.3 1.1-3.1 1.8-5 1.8-.3 0-.7 0-1-.1 1.8 1.2 4 1.9 6.3 1.9C13.8 18.6 18 12 18 6.3v-.6c.8-.6 1.5-1.4 2-2.2-.7.3-1.5.5-2.4.6z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.youtube.com/channel/UCvCLdSgYdSTpWcOgEJgi-ng" aria-label="Nature on YouTube" data-track="click" data-track-action="youtube" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M7.9 12.6V6.9l5.4 2.8c0 .1-5.4 2.9-5.4 2.9zM19.8 6s-.2-1.4-.8-2c-.8-.8-1.6-.8-2-.9-2.8-.2-7-.2-7-.2s-4.2 0-7 .2c-.4 0-1.2 0-2 .9-.6.6-.8 2-.8 2S0 7.6 0 9.2v1.5c0 1.7.2 3.3.2 3.3s.2 1.4.8 2c.8.8 1.8.8 2.2.9 1.6.1 6.8.2 6.8.2s4.2 0 7-.2c.4 0 1.2-.1 2-.9.6-.6.8-2 .8-2s.2-1.6.2-3.3V9.2c0-1.6-.2-3.2-.2-3.2z"/></svg>
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="c-footer__grid">
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Discover content</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/siteindex" data-track="click" data-track-action="journals a-z" data-track-label="link">Journals A-Z</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/subjects/" data-track="click" data-track-action="article by subject" data-track-label="link">Articles by subject</a></li>
+ <li class="c-footer__item"><a href="https://nano.nature.com/" data-track="click" data-track-action="nano" data-track-label="link">Nano</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/protocolexchange/" data-track="click" data-track-action="protocol exchange" data-track-label="link">Protocol Exchange</a></li>
+ <li class="c-footer__item"><a href="https://www.natureindex.com/" data-track="click" data-track-action="nature index" data-track-label="link">Nature Index</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Publish with us</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/author_resources/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to Authors</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/peer_review/" data-track="click" data-track-action="guide to referees" data-track-label="link">Guide to Referees</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/publishing-with-npg/" data-track="click" data-track-action="open access" data-track-label="link">Open access</a></li>
+ <li ><a href="https://www.nature.com/reprints/" data-track="click" data-track-action="reprints and permissions" data-track-label="link">Reprints &amp; permissions</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Researcher services</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/authors/research-data" data-track="click" data-track-action="data research service" data-track-label="link">Research data</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/go/nr" data-track="click" data-track-action="language editing" data-track-label="link">Language editing</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/scientific-editing/" data-track="click" data-track-action="scientific editing" data-track-label="link">Scientific editing</a></li>
+ <li class="c-footer__item"><a href="https://masterclasses.nature.com/" data-track="click" data-track-action="nature masterclasses" data-track-label="link">Nature Masterclasses</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/researcher-training/" data-track="click" data-track-action="nature research academies" data-track-label="link">Nature Research Academies</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Libraries &amp; institutions</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/tools-services" data-track="click" data-track-action="librarian service and tools" data-track-label="link">Librarian service &amp; tools</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/manage-your-account/librarianportal" data-track="click" data-track-action="librarian portal" data-track-label="link">Librarian portal</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/about-open-access/information-for-institutions/" data-track="click" data-track-action="open research" data-track-label="link">Open research</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Advertising &amp; partnerships</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/digital-advertising/" data-track="click" data-track-action="advertising" data-track-label="link">Advertising</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/" data-track="click" data-track-action="partnerships and services" data-track-label="link">Partnerships &amp; Services</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/media-kits/" data-track="click" data-track-action="media kits" data-track-label="link">Media kits</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/branded-content-native-advertising/" data-track-action="branded content" data-track-label="link">Branded content</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Career development</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/naturecareers" data-track="click" data-track-action="nature careers" data-track-label="link">Nature Careers</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureconferences/" data-track="click" data-track-action="nature conferences" data-track-label="link">Nature<span class="visually-hidden"> </span> Conferences</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureevents/" data-track="click" data-track-action="nature events" data-track-label="link">Nature<span class="visually-hidden"> </span> events</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Regional websites</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="http://www.naturechina.com" data-track="click" data-track-action="nature china" data-track-label="link">Nature China</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nindia" data-track="click" data-track-action="nature india" data-track-label="link">Nature India</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ja-jp/" data-track="click" data-track-action="nature japan" data-track-label="link">Nature Japan</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ko-kr/" data-track="click" data-track-action="nature korea" data-track-label="link">Nature Korea</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nmiddleeast/" data-track="click" data-track-action="nature middle east" data-track-label="link">Nature Middle East</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Legal &amp; Privacy</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/info/privacy.html" data-track="click" data-track-action="privacy policy" data-track-label="link">Privacy Policy</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/cookies.html" data-track="click" data-track-action="use of cookies" data-track-label="link">Use of cookies</a></li>
+ <li class="c-footer__item"><a class="optanon-toggle-display" href="javascript:;" data-track="click" data-track-action="manage cookies" data-track-label="link">Manage cookies/Do not sell my data</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/legal_notice.html" data-track="click" data-track-action="legal notice" data-track-label="link">Legal notice</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/accessibility_statement.html" data-track="click" data-track-action="accessibility statement" data-track-label="link">Accessibility statement</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/tandc.html" data-track="click" data-track-action="terms and conditions" data-track-label="link">Terms &amp; Conditions</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/ccpa" data-track="click" data-track-action="california privacy statement" data-track-label="link">California Privacy Statement</a></li>
+ </ul>
+ </div>
+ </div>
+</div>
+
+
+ </div>
+ </div>
+ </div>
+
+ <div class="c-corporate-footer">
+ <div class="u-container">
+ <img src="/static/images/logos/sn-logo-white.ea63208b81.svg" alt="Springer Nature" loading="lazy" width="140" height="14"/>
+ <p class="c-corporate-footer__legal" data-test="copyright">&copy; 2020 Springer Nature Limited</p>
+ </div>
+</div>
+
+
+ <svg class="u-hide hide">
+ <symbol id="global-icon-chevron-right" viewBox="0 0 16 16">
+ <path d="M7.782 7L5.3 4.518c-.393-.392-.4-1.022-.02-1.403a1.001 1.001 0 011.417 0l4.176 4.177a1.001 1.001 0 010 1.416l-4.176 4.177a.991.991 0 01-1.4.016 1 1 0 01.003-1.42L7.782 9l1.013-.998z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-download" viewBox="0 0 16 16">
+ <path d="M2 14c0-.556.449-1 1.002-1h9.996a.999.999 0 110 2H3.002A1.006 1.006 0 012 14zM9 2v6.8l2.482-2.482c.392-.392 1.022-.4 1.403-.02a1.001 1.001 0 010 1.417l-4.177 4.177a1.001 1.001 0 01-1.416 0L3.115 7.715a.991.991 0 01-.016-1.4 1 1 0 011.42.003L7 8.8V2c0-.55.444-.996 1-.996.552 0 1 .445 1 .996z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-email" viewBox="0 0 18 18">
+ <path d="M1.995 2h14.01A2 2 0 0118 4.006v9.988A2 2 0 0116.005 16H1.995A2 2 0 010 13.994V4.006A2 2 0 011.995 2zM1 13.994A1 1 0 001.995 15h14.01A1 1 0 0017 13.994V4.006A1 1 0 0016.005 3H1.995A1 1 0 001 4.006zM9 11L2 7V5.557l7 4 7-4V7z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-institution" viewBox="0 0 18 18">
+ <path d="M14 8a1 1 0 011 1v6h1.5a.5.5 0 01.5.5v.5h.5a.5.5 0 01.5.5V18H0v-1.5a.5.5 0 01.5-.5H1v-.5a.5.5 0 01.5-.5H3V9a1 1 0 112 0v6h8V9a1 1 0 011-1zM6 8l2 1v4l-2 1zm6 0v6l-2-1V9zM9.573.401l7.036 4.925A.92.92 0 0116.081 7H1.92a.92.92 0 01-.528-1.674L8.427.401a1 1 0 011.146 0zM9 2.441L5.345 5h7.31z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-search" viewBox="0 0 22 22">
+ <path fill-rule="evenodd" d="M21.697 20.261a1.028 1.028 0 01.01 1.448 1.034 1.034 0 01-1.448-.01l-4.267-4.267A9.812 9.811 0 010 9.812a9.812 9.811 0 1117.43 6.182zM9.812 18.222A8.41 8.41 0 109.81 1.403a8.41 8.41 0 000 16.82z"/>
+ </symbol>
+ <symbol id="global-icon-info" viewBox="0 0 18 18">
+ <path d="m9 0c4.9705627 0 9 4.02943725 9 9 0 4.9705627-4.0294373 9-9 9-4.97056275 0-9-4.0294373-9-9 0-4.97056275 4.02943725-9 9-9zm0 7h-1.5l-.11662113.00672773c-.49733868.05776511-.88337887.48043643-.88337887.99327227 0 .47338693.32893365.86994729.77070917.97358929l.1126697.01968298.11662113.00672773h.5v3h-.5l-.11662113.0067277c-.42082504.0488782-.76196299.3590206-.85696816.7639815l-.01968298.1126697-.00672773.1166211.00672773.1166211c.04887817.4208251.35902055.761963.76398144.8569682l.1126697.019683.11662113.0067277h3l.1166211-.0067277c.4973387-.0577651.8833789-.4804365.8833789-.9932723 0-.4733869-.3289337-.8699473-.7707092-.9735893l-.1126697-.019683-.1166211-.0067277h-.5v-4l-.00672773-.11662113c-.04887817-.42082504-.35902055-.76196299-.76398144-.85696816l-.1126697-.01968298zm0-3.25c-.69035594 0-1.25.55964406-1.25 1.25s.55964406 1.25 1.25 1.25 1.25-.55964406 1.25-1.25-.55964406-1.25-1.25-1.25z" fill-rule="evenodd"/>
+ </symbol>
+ </svg>
+
+</footer>
+
+
+</body>
+</html>
+
diff --git a/python/tests/files/peerj_oa_article.html b/python/tests/files/peerj_oa_article.html
new file mode 100644
index 0000000..f2cf365
--- /dev/null
+++ b/python/tests/files/peerj_oa_article.html
@@ -0,0 +1,2365 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+ <meta charset="utf-8">
+
+ <title>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles [PeerJ]</title>
+
+
+ <link rel="dns-prefetch" href="https://d2pdyyx74uypu5.cloudfront.net/">
+ <link rel="dns-prefetch" href="http://static.peerj.com/">
+<link rel="dns-prefetch" href="https://doi.org">
+
+
+ <meta name="citation_title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"><meta name="citation_date" content="2018-02-13"><meta name="citation_doi" content="10.7717/peerj.4375"><meta name="citation_language" content="en"><meta name="citation_pdf_url" content="https://peerj.com/articles/4375.pdf"><meta name="citation_fulltext_html_url" content="https://peerj.com/articles/4375"><meta name="citation_volume" content="6"><meta name="citation_firstpage" content="e4375"><meta name="citation_keywords" content="Open access; Open science; Scientometrics; Publishing; Libraries; Scholarly communication; Bibliometrics; Science policy"><meta name="citation_journal_title" content="PeerJ"><meta name="citation_journal_abbrev" content="PeerJ"><meta name="citation_publisher" content="PeerJ Inc."><meta name="citation_issn" content="2167-8359"><meta name="citation_author" content="Heather Piwowar"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="heather@impactstory.org"><meta name="citation_author" content="Jason Priem"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="jason@impactstory.org"><meta name="citation_author" content="Vincent Larivière"><meta name="citation_author_institution" content="École de bibliothéconomie et des sciences de l’information, Université de Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author" content="Juan Pablo Alperin"><meta name="citation_author_institution" content="Canadian Institute for Studies in Publishing, Simon Fraser University, Vancouver, BC, Canada"><meta name="citation_author_institution" 
content="Public Knowledge Project, Canada"><meta name="citation_author" content="Lisa Matthias"><meta name="citation_author_institution" content="Scholarly Communications Lab, Simon Fraser University, Vancouver, Canada"><meta name="citation_author" content="Bree Norlander"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Ashley Farley"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Jevin West"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author" content="Stefanie Haustein"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="School of Information Studies, University of Ottawa, Ottawa, ON, Canada">
+ <meta name="description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+
+ <meta property="og:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+ <meta name="twitter:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:url" content="https://peerj.com/articles/4375">
+ <meta name="twitter:site" content="@thePeerJ">
+ <meta name="twitter:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta name="twitter:description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+ <meta property="og:type" content="article">
+ <meta property="og:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta property="og:url" content="https://peerj.com/articles/4375">
+ <meta property="og:site_name" content="PeerJ">
+
+
+ <link rel="alternate" type="application/pdf" href="/articles/4375.pdf">
+ <link rel="alternate" type="application/rdf+xml" href="/articles/4375.rdf">
+ <link rel="alternate" type="application/json" href="/articles/4375.json">
+ <link rel="alternate" type="application/xml" href="/articles/4375.xml">
+ <link rel="alternate" type="application/unixref+xml" href="/articles/4375.unixref">
+ <link rel="alternate" type="application/vnd.citationstyles.csl+json" href="/articles/4375.citeproc">
+ <link rel="alternate" type="application/bibjson+json" href="/articles/4375.bibjson">
+ <link rel="alternate" type="text/html" href="/articles/4375.html">
+
+ <link rel="canonical" href="https://peerj.com/articles/4375/">
+
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta property="fb:app_id" content="534542813234464">
+
+ <link rel="stylesheet" href="/css/05b9c3d-27443c7.css" media="screen">
+
+<!--[if lt IE 9]>
+ <link rel="stylesheet" href="/assets/css/ie8.css" media="screen">
+<![endif]-->
+
+<!--[if lt IE 10]>
+ <link rel="stylesheet" href="/assets/css/ie9.css" media="screen">
+<![endif]-->
+
+ <style media="screen">html, body { height: 100%; }</style>
+ <link rel="stylesheet" href="https://cdn.peerj.com/webpack/vue-bundle.2cdd25e1.css">
+
+
+ <link rel="stylesheet" href="/css/a0c1a2c-04690d8.css" media="screen">
+
+ <link rel="stylesheet" href="/css/be477b9-1134171.css" media="screen">
+ <link rel="stylesheet" href="/css/3e4ba6d-c134b5f.css" media="print">
+ <script src="/js/36e5d51-2d7025c.js"></script>
+<script src="/assets/js/polyfills/includes.js"></script>
+<script src="/assets/js/polyfills/startsWith.js"></script><!--[if lt IE 9]>
+<script src="/assets/js/html5shiv.js"></script>
+
+<![endif]-->
+
+<!--[if lt IE 8]>
+<script src="/assets/js/json2.js"></script>
+<![endif]-->
+
+<script>
+ // Global namespace object for the page's scripts. Each property starts as an
+ // empty holder; later inline scripts on this page attach PeerJ.Com.Mixpanel
+ // and PeerJ.Com.GA to it.
+ var PeerJ = {
+ Article: {},
+ User: {
+ anonymous: true },
+ Publication: {},
+ Production: {},
+ Event: {},
+ Com: {},
+ Payment: {},
+ Annotation: {},
+ Search: {},
+ Home: {},
+ Subjects: {},
+ Advocacy: {},
+ Job: {},
+ ContentAlert: {},
+ Tools: {}
+ };
+</script>
+
+
+<script>
+ // Query-string parameter names that campaignParams() looks up in the page URL.
+ var campaign_keywords = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'];
+ // Scratch global: holds the value of the most recent lookup in campaignParams().
+ var kw = '';
+ // Filled by campaignParams() with '<keyword>-last', '<keyword>-first' and
+ // '<keyword>-all' entries respectively; firstUtms is read by updatePreregCookie().
+ var lastUtms = {};
+ var firstUtms = {};
+ var allUtms = {};
+
+ function campaignParams() {
+ var index;
+ for (index = 0; index < campaign_keywords.length; ++index) {
+ kw = getQueryParam(document.URL, campaign_keywords[index]);
+ if (kw.length) {
+ lastUtms[campaign_keywords[index] + '-last'] = kw;
+ firstUtms[campaign_keywords[index] + '-first'] = kw;
+ allUtms[campaign_keywords[index] + '-all'] = kw;
+ }
+ }
+ }
+
+ function updatePreregCookie(preregCookie, firstUtmKey) {
+ var utmVal = firstUtms[firstUtmKey];
+ if (utmVal) {
+ var existingPreregCampaign = $.cookie(preregCookie);
+ var appendPreregCampaign;
+ if (!existingPreregCampaign) {
+ appendPreregCampaign = utmVal;
+ } else {
+ appendPreregCampaign = existingPreregCampaign + ',' + utmVal;
+
+ }
+ $.cookie(preregCookie, appendPreregCampaign, {expires: 365, path: "/"});
+ }
+ }
+
+ function getQueryParam(url, param) {
+ // Expects a raw URL
+ param = param.replace(/[[]/, "\[").replace(/[]]/, "\]");
+ var regexS = "[\?&]" + param + "=([^&#]*)",
+ regex = new RegExp( regexS ),
+ results = regex.exec(url);
+ if (results === null || (results && typeof(results[1]) !== 'string' && results[1].length)) {
+ return '';
+ } else {
+ return decodeURIComponent(results[1]).replace(/\W/gi, ' ');
+ }
+ }
+
+ function articlePageEvent() {
+ var articleContainer = $('.publication-jsondata');
+ if (articleContainer.length) {
+ var data = articleContainer.data('publication-meta');
+
+ // Must be public
+ if (data.publicationSubjects.length) {
+
+ var eventName = 'Viewed-article';
+ var preprint = data.preprint;
+ if (preprint) {
+ eventName = 'Viewed-preprint';
+ }
+
+ data['ip-hash'] = 'bf3914b8088a79fb1fcf39cb526631c0';
+ mixpanel.track(eventName, data);
+ }
+ }
+ }
+
+ // Empty on this page. It is still invoked from the Mixpanel `loaded` callback
+ // in PeerJ.Com.Mixpanel.leadView, so the name must stay defined.
+ function sectionListViewEvent() {
+ }
+</script>
+ <script>
+ // User agrees to terms on signup, so Mixpanel is OK
+ // On submit, update mixpanel distinct id
+ setTimeout(function () {
+ var regmixpanel = document.getElementById('fos_user_registration_form_mixpanelId');
+ if (regmixpanel) {
+ var distinctId = $.cookie('pj_mp_distinct');
+ if (!distinctId) {
+ distinctId = mixpanel.get_distinct_id();
+ }
+ console.log(distinctId);
+ regmixpanel.value = distinctId;
+ }
+ }, 1500);
+
+ // If logged out then check if consented to analytics cookies (if applicable to country)
+ // Run through cookieConsent only
+ // PeerJ.Com.Mixpanel exposes a single method, leadView(), which initialises
+ // Mixpanel. leadView() is not called anywhere in this script block.
+ PeerJ.Com.Mixpanel = new function() {
+ this.leadView = function() {
+ mixpanel.init('776a79e14e8f05a81ca92536c83f08b4', {
+ 'secure_cookie': true,
+ // Once Mixpanel reports it has loaded, wait one second and then send the
+ // page-view events.
+ loaded: function (mixpanel) {
+ setTimeout(function () {
+ articlePageEvent();
+
+ sectionListViewEvent();
+
+
+
+ }, 1000);
+ }
+ });
+ }
+ };
+
+ campaignParams();
+ updatePreregCookie('pj_prereg_campaign', 'utm_campaign-first');
+ updatePreregCookie('pj_prereg_content', 'utm_content-first');
+ updatePreregCookie('pj_prereg_term', 'utm_term-first');
+ </script>
+
+
+
+ <script>(function(p,u,s,h,x){p.pushpad=p.pushpad||function(){(p.pushpad.q=p.pushpad.q||[]).push(arguments)};h=u.getElementsByTagName('head')[0];x=u.createElement('script');x.async=1;x.src=s;h.appendChild(x);})(window,document,'https://pushpad.xyz/pushpad.js');
+// The loader above defines window.pushpad as a stub that queues its arguments
+// in pushpad.q, then appends an async <script> for pushpad.js to <head>.
+// NOTE(review): 5977 is presumably the Pushpad project id; confirm.
+pushpad('init', 5977, {hostname: 'peerj.com'});
+</script>
+
+ <link rel="search" type="application/opensearchdescription+xml" href="https://peerj.com/articles/osd.xml" title="PeerJ">
+
+
+
+
+
+ <script>
+ // Run through cookieConsent only
+ // PeerJ.Com.GA groups two helpers: disabletracking() sets the
+ // window['ga-disable-UA-31208920-1'] flag, and runGA() loads analytics.js
+ // and sends the page view.
+ PeerJ.Com.GA = new function() {
+ this.disabletracking = function() {
+ window['ga-disable-' + 'UA-31208920-1'] = true;
+ };
+
+ this.runGA = function() {
+ // Loader: defines window.ga as a stub that queues its arguments in ga.q,
+ // records the current time in ga.l, then inserts an async <script> for
+ // analytics.js before the first <script> element in the document.
+ (function (i, s, o, g, r, a, m) {
+ i['GoogleAnalyticsObject'] = r;
+ i[r] = i[r] || function () {
+ (i[r].q = i[r].q || []).push(arguments)
+ }, i[r].l = 1 * new Date();
+ a = s.createElement(o),
+ m = s.getElementsByTagName(o)[0];
+ a.async = 1;
+ a.src = g;
+ m.parentNode.insertBefore(a, m)
+ })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');
+
+ // \u002D is an escaped hyphen: the id is 'UA-31208920-1', the same one
+ // used by disabletracking().
+ ga('create', 'UA\u002D31208920\u002D1', 'auto');
+
+ // Removes last octet
+ ga('set', 'anonymizeIp', true);
+
+
+
+
+
+
+
+
+
+ // \u0020 is an escaped space and \u003B an escaped semicolon, so the value
+ // is ';Legal Issues;Science Policy;Data Science;'.
+ ga('set', 'dimension4', ';Legal\u0020Issues\u003BScience\u0020Policy\u003BData\u0020Science;');
+
+ ga('require', 'displayfeatures');
+
+ ga('send', 'pageview');
+
+ // After 15 seconds on the page, send an extra event.
+ window.setTimeout(function () {
+ ga('send', 'event', 'adjusted bounce rate', 'page visit 15 seconds or more');
+ }, 15000);
+
+
+ }
+ };
+ </script>
+ <script src="/js/8548491-f0f5b7c.js"></script>
+
+<link rel="apple-touch-icon" sizes="57x57" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-57x57.png">
+<link rel="apple-touch-icon" sizes="60x60" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-60x60.png">
+<link rel="apple-touch-icon" sizes="72x72" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-72x72.png">
+<link rel="apple-touch-icon" sizes="76x76" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-76x76.png">
+<link rel="apple-touch-icon" sizes="114x114" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-114x114.png">
+<link rel="apple-touch-icon" sizes="120x120" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-120x120.png">
+<link rel="apple-touch-icon" sizes="144x144" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-144x144.png">
+<link rel="apple-touch-icon" sizes="152x152" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-152x152.png">
+<link rel="apple-touch-icon" sizes="180x180" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-180x180.png">
+<link rel="icon" type="image/png" sizes="192x192" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/android-icon-192x192.png">
+<link rel="shortcut icon" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon.ico">
+<link rel="icon" type="image/png" sizes="32x32" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-32x32.png">
+<link rel="icon" type="image/png" sizes="96x96" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-96x96.png">
+<link rel="icon" type="image/png" sizes="16x16" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-16x16.png">
+<link rel="manifest" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/manifest.json">
+<meta name="msapplication-TileColor" content="#ffffff">
+<meta name="msapplication-TileImage" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/ms-icon-144x144.png">
+<meta name="msapplication-config" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/browserconfig.xml">
+<meta name="theme-color" content="#ffffff"></head>
+
+<body class="">
+
+ <!-- FreshDesk variable (TODO: move elsewhere) -->
+
+
+<nav class="navbar navbar-fixed-top navbar-inverse navbar-alpha" role="navigation"><div class="navbar-inner"><!-- .btn-navbar is used as the toggle for collapsed navbar content --><a class="btn btn-navbar pull-right" data-toggle="collapse" data-target=".nav-collapse"><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></a><!-- logo --><ul class="nav pull-left nav-sections nav-journal"><li class="dropdown"><a href="/" class="dropdown-toggle "
+ data-toggle="dropdown"><span id="navJournalTitle">PeerJ Journals</span><b class="caret"></b></a><ul class="dropdown-menu journal-list"><li><a href="/">PeerJ Publishing Overview</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">PeerJ – Life & Environment</a><ul class="dropdown-menu"><li><a href="/sections/">About the journal Sections</a></li><li class="divider"></li><li><a href="/sections/aquatic-biology/">Aquatic Biology</a></li><li><a href="/sections/biochemistry-biophysics-molecular-biology/">Biochemistry, Biophysics and Molecular Biology</a></li><li><a href="/sections/biodiversity-conservation/">Biodiversity and Conservation</a></li><li><a href="/sections/bioinformatics-genomics/">Bioinformatics and Genomics</a></li><li><a href="/sections/brain-cognition/">Brain and Cognition</a></li><li><a href="/sections/ecology/">Ecology</a></li><li><a href="/sections/environ-sci/">Environmental Science</a></li><li><a href="/sections/microbiology/">Microbiology</a></li><li><a href="/sections/paleontology-evolutionary-science/">Paleontology and Evolutionary Science</a></li><li><a href="/sections/plant-biology/">Plant Biology</a></li><li><a href="/sections/zoological-science/">Zoological Science</a></li></ul></li><li><a href="/computer-science/">
+ PeerJ Computer Science
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Physical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Organic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Inorganic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Analytical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Materials Science
+ </a></li><li class="divider"></li><li><a href="https://peerj.org/" target="_blank">Visit PeerJ.org and get involved</a></li></ul></li></ul><!-- mobile-only top nav items --><ul class="nav pull-left nav-about-phone hidden-desktop"><li class="dropdown"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><li><!-- checkout items --></li><li><!-- notifications --></li></ul><!-- sections --><ul class="nav pull-left nav-collapse nav-sections nav-sections-main collapse search-hide"><li class="dropdown visible-desktop"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><!-- more --><li class="dropdown"><a href="#" class="dropdown-toggle"
+ data-toggle="dropdown">More <b class="caret"></b></a><ul class="dropdown-menu" role="menu" aria-labelledby="dLabel"><li><a href="/expertrxiv/"><img src="/assets/images/icons/expertrxiv.png" style="width: 80px"/></a></li><li><a href="/subjects/">Subjects</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Search articles</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Peer-reviewed Journals</li><li><a tabindex="-1" href="/articles/?journal=peerj">PeerJ (Life, Biological, Environmental and Health Sciences)</a></li><li><a tabindex="-1" href="/articles/?journal=cs">PeerJ Computer Science</a></li><li><a tabindex="-1" href="/articles/?journal=pchem">PeerJ Physical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ochem">PeerJ Organic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ichem">PeerJ Inorganic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=achem">PeerJ Analytical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=matsci">PeerJ Materials Science</a></li><li role="presentation" class="dropdown-header">Preprints</li><li><a tabindex="-1" href="/preprints/">PeerJ Preprints</a></li></ul></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Table of contents</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Table of Contents - current and archives</li><li><a tabindex="-1" href="/medicine/">PeerJ - Medicine articles</a></li><li><a tabindex="-1" href="/biology/">PeerJ - Biology & Life science articles</a></li><li><a tabindex="-1" href="/environment/">PeerJ - Environmental Science articles</a></li><li><a tabindex="-1" href="/general/">PeerJ - General bio (stats, legal, policy, edu)</a></li><li class="divider"></li><li><a tabindex="-1" href="/cs/">PeerJ Computer Science</a></li><li class="divider"></li><li><a tabindex="-1" href="/preprints-toc/">PeerJ Preprints</a></li></ul></li><li><a 
href="/academic-boards/advisors/">Academic advisors</a></li><li><a href="/reviewer-match/">Volunteer to review</a></li><li><a href="/collections/">Collections</a></li><li><a href="/questions/">Discussions</a></li><li><a href="https://peerj.com/blog/">Blog</a></li><li><a href="/prepaid-publishing/">Prepaid Publishing</a></li><li><a href="/about/reviews/">Reviews and awards</a></li><li><a href="/spread-the-word/">Spread the word</a></li><li><a href="/about/">Who are we?</a></li><li><a href="/about/contact/">Contact</a></li></ul></li></ul><!-- search --><div class="nav nav-collapse collapse pull-right nav-search"><form class="navbar-search" action="/search/"><input name="q" type="search"
+ data-autocomplete-url="/search/"
+ class="search-query" placeholder="Search"><!--<i class="icon-search"></i>--></form></div><ul class="nav pull-right nav-collapse collapse search-hide nav-utilities"><!-- login desktop --><li><a id="front-page-login" href="/login">Login</a></li></ul><ul class="nav pull-right search-hide nav-shifter"></ul><!-- for authors, my manuscripts --><ul class="nav nav-center nav-collapse collapse search-hide pull-right"><!-- for authors --><li class="dropdown nav-authors"><a href="#" class="dropdown-toggle" data-toggle="dropdown"><i
+ class="icon-info4 icon-large nav-icon icomoon"></i><span class="visible-wide">AUTHORS</span><b class="caret"></b></a><ul class="dropdown-menu"><li><a href="/benefits/">Peer Journals Overview</a></li><li><a href="/about/author-instructions/">Submission Guidelines</a></li><li><a href="/subjects/">Subject Areas</a></li><li><a href="/academic-boards/">Editorial Board</a></li><li><a href="/about/editorial-criteria/">Editorial Criteria</a></li><li><a href="/pricing/">Pricing</a></li><li><a href="/about/FAQ/">General FAQ</a></li><li><a href="/computer-science/faq-cs/">Computer Science FAQ</a></li><li><a href="/about/aims-and-scope/">Aims and Scope</a></li><li><a href="/about/author-interviews/">Author Interviews</a></li><li><a href="/about/policies-and-procedures/">Policies and Procedures</a></li><!--<li><a href="#">Why PeerJ?</a></li>--></ul></li><!-- my manuscripts --><!-- note: dropdown classes used just to maintain display --><li class="nav-manuscripts dropdown"><a href="/new/" class="dropdown-toggle"><span>SUBMIT ARTICLE</span></a></li></ul></div></nav>
+
+ <div class="item-top-navbar">
+ <div class="item-top-navbar-inner">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <div class="span12">
+ <div class="item-metrics-counts-top-nav article-item-metrics-counts">
+ <span class="article-item-metrics-count visible-all">
+ <span data-count="citations">203</span>
+ <span class="article-item-metrics-label">Citations</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-html">&nbsp;</span>
+ <span class="article-item-metrics-label">Views</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-pdf">&nbsp;</span>
+ <span class="article-item-metrics-label">Downloads</span>
+ </span>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</div>
+
+ <div id="wrap">
+
+
+
+ <div id="nav-pad"></div>
+
+
+ <div class="container">
+
+ <noscript class="js-disabled-warning">
+ <div class="alert alert-danger">
+ <i class="icon icon-warning-sign"></i> Javascript is disabled in your browser. Please <a href="https://www.enable-javascript.com" target="_blank">enable Javascript</a> to view PeerJ.
+ </div>
+ </noscript>
+
+
+ <div class="row publication-jsondata" data-publication-meta="{&quot;publicationId&quot;:&quot;4375&quot;,&quot;Article-section&quot;:&quot;NA&quot;,&quot;journal&quot;:&quot;PeerJ&quot;,&quot;published&quot;:&quot;2018-02-13 08:54:18&quot;,&quot;preprint&quot;:false,&quot;publicationSubjects&quot;:[&quot;Legal Issues&quot;,&quot;Science Policy&quot;,&quot;Data Science&quot;],&quot;publicationInstitutions&quot;:[&quot;Simon Fraser University&quot;,&quot;University of Washington&quot;,&quot;University of Ottawa&quot;],&quot;publicationTop20Institution&quot;:true,&quot;publicationInstitutionPlan&quot;:true}">
+ <!-- Left sidebar -->
+ <div class="span1 article-sidebar">
+ <div class="article-sidebar-left">
+ <div class="sidebar-box sidebar-box--journal">
+ <a href="/" class="sidebar-box--journal-mask"></a>
+ <img src="https://d2pdyyx74uypu5.cloudfront.net/images/article/logos/article-logo-peerj.png">
+ </div>
+
+ <div id="btn-view-tweets" class="sidebar-box sidebar-box--tweet">
+ <div class="text-center">View 618 tweets <i class="icon-twitter"></i></div>
+ </div>
+
+ <a href="#related-research" class="sidebar-box sidebar-box--related text-center">
+ Related research
+ <i class="icon-angle-down"></i>
+ </a>
+
+ <!-- mobile only -->
+ <div class="item-leftside-actions">
+ <div class="sidebar-box sidebar-box--action js-download-modal-trigger">Download</div>
+
+ <div id="notification-actions-mobile" class="sidebar-box sidebar-box--action" data-href="/following/publication/4522/">
+ <span class="follow-btn " id="item-left-follow-btn"
+ title="Receive article updates" data-toggle="tooltip" data-success-modal="#followModal"
+ data-href="/follow/publication/4522/0/">
+ <span class="button_text_follow">Follow</span>
+ </span>
+</div>
+
+
+
+ <div class="sidebar-box sidebar-box--social visible-desktop">
+ <div class="sidebar-box--social-title">Share</div>
+ <div class="d-flex">
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </div>
+</div>
+
+<div class="btn-group sidebar-box sidebar-box--action">
+ <a href="#" class="btn-share dropdown-toggle" data-toggle="dropdown">Share</a>
+
+ <ul class="dropdown-menu">
+ <li>
+ <a href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+ </ul>
+</div>
+
+ </div>
+
+ </div>
+
+ <div class="peer-reviewed visible-phone">
+ <i class="icon-ok"></i> PEER-REVIEWED
+ </div>
+
+ </div>
+
+ <div id="annotations-sidebar" class="span5"></div>
+
+ <!-- Middle col -->
+ <div id="article-item-middle" class="span7"
+ data-ms-type-entity="articles" data-ms-type-id="research-article" data-ms-type-text="Research-article">
+
+ <div id="article-tweets-container">
+ <div class="row-fluid article-tweets-header">
+ <div class="span9">
+ <h2><em>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</em></h2>
+ </div>
+ <div class="span3">
+ <div class="btn btn-inverse pull-right" id="btn-view-article"><span class="icon-file"></span> View article</div>
+ </div>
+ </div>
+ <div class="tweet-items"> <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1297703289707016194/-sYklkZs_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=164969574" target="_blank"><strong></strong> <span class="twitter-handle">@LorenAndreaEP</span></a>
+ <span class="item-tweet-date">11 days ago</span>
+ </div>
+ <div>RT @AMAldanaS: También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradore…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1293635358064807937/YCE7J6e-_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15271321" target="_blank"><strong>Rachel Borchardt</strong> <span class="twitter-handle">@ButternutSquash</span></a>
+ <span class="item-tweet-date">12 days ago</span>
+ </div>
+ <div>@ces43 May I recommend Piwowar and Priem et al&#039;s article for that topic? https://t.co/Fnm0vtYtKS</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1210228942415814656/L6yRkSyu_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1117109826" target="_blank"><strong>Ana M. Aldana</strong> <span class="twitter-handle">@AMAldanaS</span></a>
+ <span class="item-tweet-date">40 days ago</span>
+ </div>
+ <div>También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradores de 2018 en donde se evidencia la ventaja de publicar en green open access: . https://t.co/1HAmYlfoBP</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/982225468286840837/BM5R0jJh_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=982223918223130624" target="_blank"><strong>Scicomm</strong> <span class="twitter-handle">@ScicommBot</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/856499301358477312/GLL-DiUg_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=850296415708471297" target="_blank"><strong>Open Pharma</strong> <span class="twitter-handle">@_OpenPharma</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">102 days ago</span>
+ </div>
+ <div>@Mietmensch @unpaywall Gotcha. It&#039;s tough to generalize the answer to that, as it depends a lot on the specific journal and field. We dove into the details more in this paper, though: https://t.co/HRus7k3P0B</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">103 days ago</span>
+ </div>
+ <div>@dwhly @unpaywall @hpiwowar historical stats are in here: https://t.co/HRus7k3P0B
+
+prediction for future is here: https://t.co/ex0vvThc9G</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/456347532637896704/We-tZ-rF_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=13616592" target="_blank"><strong>Eric Sieverts</strong> <span class="twitter-handle">@sieverts</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/633201529575632897/5rB4RNtd_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=163244377" target="_blank"><strong>Hector Keun</strong> <span class="twitter-handle">@hectorkeun</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @OxonAndrew: A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact o…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1233869298344611840/suKOWJtS_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1024381399447613443" target="_blank"><strong>Asynchrony</strong> <span class="twitter-handle">@temporalization</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @egonwillighagen: the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJ…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/447652981291614208/RtR2dZtC_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=536409536" target="_blank"><strong>Andrew Singer</strong> <span class="twitter-handle">@OxonAndrew</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles” @thePeerJ https://t.co/yCu96hCzMK</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighⓐgen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJV72Uf https://t.co/DE9MPIKTdZ</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighⓐgen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>@egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for values.</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1220321309411942408/nhm-dSur_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1215236299344502791" target="_blank"><strong>Open Science Community Maastricht</strong> <span class="twitter-handle">@OSCMaastricht</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1263564961068077059/CKFX9dV2_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=371391064" target="_blank"><strong>Marie E McVeigh</strong> <span class="twitter-handle">@JopieNet</span></a>
+ <span class="item-tweet-date">121 days ago</span>
+ </div>
+ <div>@lisalibrarian @ashleydfarley @andy_nobes Usual def of &quot;bronze&quot; in @our_research is free to read, but does not have CC license.
+https://t.co/T34fQja0nN</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">146 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+
+<div class="tweet-pagination pagination">
+
+ <ul>
+
+ <li class="active"><a href="#">1</a></li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2" class="page">2</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=3" class="page">3</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=4" class="page">4</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=5" class="page">5</a>
+ </li>
+
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2">Next</a>
+ </li>
+ </ul>
+
+ <hr>
+</div></div>
+</div>
+ <div id="article-main-container">
+ <div class="article-section-breadcrumb">
+ <span class="icon-angle-left"></span>
+ <span><a href="/"><em>PeerJ</em></a></span>
+ </div>
+
+
+ <div class="hidden-print">
+
+ <div id="article-preexisting" class="well peerj-paper-well" >
+ <i class="icon-pushpin icon-large"></i> Note that a <a href="/preprints/3119/">Preprint of this article</a> also exists, first published August 2, 2017.
+ </div>
+ </div>
+
+ <!-- Main article -->
+ <article itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle"><header class="article-meta front"><h1 class="article-title" itemprop="name headline">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</h1>
+<div class="article-authors">
+<span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-1" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-1" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Heather</span> <span class="surname" itemprop="familyName">Piwowar</span></span></a><a class="corresp" href="mailto:heather@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-2" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-2" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jason</span> <span class="surname" itemprop="familyName">Priem</span></span></a><a class="corresp" href="mailto:jason@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." 
data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-3" data-jats-contrib-type="author" itemprop="author"><a href="author-3" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Vincent</span> <span class="surname" itemprop="familyName">Larivière</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-2" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-2">2</a>,<a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-4" data-jats-contrib-type="author" itemprop="author"><a href="author-4" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Juan Pablo</span> <span class="surname" itemprop="familyName">Alperin</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-4" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-4">4</a>,<a class="aff xref" href="#aff-5" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-5">5</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-5" data-jats-contrib-type="author" itemprop="author"><a href="author-5" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Lisa</span> <span class="surname" itemprop="familyName">Matthias</span></span></a><sup 
class="contrib-xref-group"><a class="aff xref" href="#aff-6" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-6">6</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-6" data-jats-contrib-type="author" itemprop="author"><a href="author-6" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Bree</span> <span class="surname" itemprop="familyName">Norlander</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-7" data-jats-contrib-type="author" itemprop="author"><a href="author-7" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Ashley</span> <span class="surname" itemprop="familyName">Farley</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-8" data-jats-contrib-type="author" itemprop="author"><a href="author-8" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jevin</span> <span class="surname" itemprop="familyName">West</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" 
itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-9" data-jats-contrib-type="author" itemprop="author"><a href="author-9" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Stefanie</span> <span class="surname" itemprop="familyName">Haustein</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a>,<a class="aff xref" href="#aff-9" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-9">9</a></sup></span>
+</div>
+<div id="article-information">
+<div class="article-notes">
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-1">
+<span class="article-label-container"><a class="article-label">1</a></span><span itemprop="address"><span class="institution">Impactstory</span>, <span class="city">Sanford</span>, <span class="state">NC</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-2">
+<span class="article-label-container"><a class="article-label">2</a></span><span itemprop="address"><span class="institution">École de bibliothéconomie et des sciences de l’information, Université de Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-3">
+<span class="article-label-container"><a class="article-label">3</a></span><span itemprop="address"><span class="institution">Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-4">
+<span class="article-label-container"><a class="article-label">4</a></span><span itemprop="address"><span class="institution">Canadian Institute for Studies in Publishing, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="state">BC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-5">
+<span class="article-label-container"><a class="article-label">5</a></span><span itemprop="address"><span class="institution">Public Knowledge Project</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-6">
+<span class="article-label-container"><a class="article-label">6</a></span><span itemprop="address"><span class="institution">Scholarly Communications Lab, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-7">
+<span class="article-label-container"><a class="article-label">7</a></span><span itemprop="address"><span class="institution">Information School, University of Washington</span>, <span class="city">Seattle</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-8">
+<span class="article-label-container"><a class="article-label">8</a></span><span itemprop="address"><span class="institution">FlourishOA</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-9">
+<span class="article-label-container"><a class="article-label">9</a></span><span itemprop="address"><span class="institution">School of Information Studies, University of Ottawa</span>, <span class="city">Ottawa</span>, <span class="state">ON</span>, <span class="country">Canada</span></span>
+</div>
+</div>
+<dl class="article-identifiers">
+<dt> DOI</dt>
+<dd>
+<a href="https://doi.org/10.7717/peerj.4375" itemprop="sameAs">10.7717/peerj.4375</a><meta itemprop="sameAs" content="info:doi/10.7717/peerj.4375">
+</dd>
+</dl>
+<dl class="article-dates">
+<dt>Published</dt>
+<dd><time itemprop="datePublished">2018-02-13</time></dd>
+<dt>Accepted</dt>
+<dd><time data-itemprop="dateAccepted">2018-01-25</time></dd>
+<dt>Received</dt>
+<dd><time itemprop="dateCreated">2017-08-09</time></dd>
+</dl>
+<dl class="article-editors">
+<dt>Academic Editor</dt>
+<dd itemprop="editor" itemscope="itemscope" itemtype="http://schema.org/Person"><a itemprop="url" href="editor-1" class="contrib" data-jats-contrib-type="editor"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Robert</span> <span class="surname" itemprop="familyName">McDonald</span></span></a></dd>
+</dl>
+<dl class="article-subjects">
+<dt>Subject Areas</dt>
+<dd>
+<a class="subject" itemprop="about" href="/subjects/?filter=Legal%20Issues">Legal Issues</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Science%20Policy">Science Policy</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Data%20Science">Data Science</a>
+</dd>
+<dt>Keywords</dt>
+<dd>
+<span class="kwd" itemprop="keywords">Open access</span>, <span class="kwd" itemprop="keywords">Open science</span>, <span class="kwd" itemprop="keywords">Scientometrics</span>, <span class="kwd" itemprop="keywords">Publishing</span>, <span class="kwd" itemprop="keywords">Libraries</span>, <span class="kwd" itemprop="keywords">Scholarly communication</span>, <span class="kwd" itemprop="keywords">Bibliometrics</span>, <span class="kwd" itemprop="keywords">Science policy</span>
+</dd>
+</dl>
+<dl class="article-license">
+<dt>Copyright</dt>
+<dd>© <span itemprop="copyrightYear">2018</span> <span itemprop="copyrightHolder">Piwowar et al.</span>
+</dd>
+<dt>Licence</dt>
+<dd>
+ <span class="license-p">This is an open access article distributed under the terms of the <a class="ext-link" href="http://creativecommons.org/licenses/by/4.0/" rel="license" data-jats-ext-link-type="uri">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, reproduction and adaptation in any medium and for any purpose provided that it is properly attributed. For attribution, the original author(s), title, publication source (PeerJ) and either DOI or URL of the article must be cited.</span>
+ </dd>
+</dl>
+<dl class="self-citation">
+<dt>Cite this article</dt>
+<dd>
+<span class="self-citation-authors">Piwowar H, Priem J, Larivière V, Alperin JP, Matthias L, Norlander B, Farley A, West J, Haustein S.</span> <span class="self-citation-year">2018</span>. <span class="self-citation-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</span>. <span itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="self-citation-journal" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ</span></span> <span class="self-citation-volume" itemprop="volumeNumber">6</span></span>:<span class="self-citation-elocation" itemprop="pageStart">e4375</span> <a href="https://doi.org/10.7717/peerj.4375" itemprop="url">https://doi.org/10.7717/peerj.4375</a>
+</dd>
+</dl>
+<div class="alert alert-success view-public-reviews">The authors have chosen to make <a href="/articles/4375/reviews/">the review history of this article</a> public.</div>
+</div>
+<div>
+<h2>Abstract</h2>
+<div class="abstract" itemprop="description">
+ <p>Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.</p>
+ </div>
+</div></header><main><div class="body" lang="en">
+ <section class="sec" id="intro">
+ <h2 class="heading">Introduction</h2>
+ <p id="p-1">The movement to provide open access (OA) to all research literature is now over fifteen years old. In the last few years, several developments suggest that after years of work, a sea change is imminent in OA. First, funding institutions are increasingly mandating OA publishing for grantees. In addition to the US National Institutes of Health, which mandated OA in 2008 (<a class="ext-link" href="https://publicaccess.nih.gov/index.htm" data-jats-ext-link-type="uri">https://publicaccess.nih.gov/index.htm</a>), the Bill and Melinda Gates Foundation (<a class="ext-link" href="http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy" data-jats-ext-link-type="uri">http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy</a>), the European Commission (<a class="ext-link" href="http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf" data-jats-ext-link-type="uri">http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf</a>), the US National Science Foundation (<a class="ext-link" href="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf" data-jats-ext-link-type="uri">https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf</a>), and the Wellcome Trust (<a class="ext-link" href="https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy" data-jats-ext-link-type="uri">https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy</a>), among others, have made OA diffusion mandatory for grantees. Second, several tools have sprung up to build value atop the growing OA corpus. These include discovery platforms like ScienceOpen and 1Science, and browser-based extensions like the Open Access Button, Canary Haz, and Unpaywall. 
Third, Sci-Hub (a website offering pirate access to full text articles) has built an enormous user base, provoking newly intense conversation around the ethics and efficiency of paywall publishing (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.11366.1" title="Looking into Pandora’s Box: the content of Sci-Hub and its usage [version 1; referees: 2 approved, 2 approved with reservations]" data-jats-ref-type="bibr" data-jats-rid="ref-26">Greshake, 2017</a>). Academic social networks like ResearchGate and Academia.edu now offer authors an increasingly popular but controversial solution to author self-archiving (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2016.08.002" title="Hybrid open access—a longitudinal study" data-jats-ref-type="bibr" data-jats-rid="ref-8">Björk, 2016a</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>). 
Finally, the increasing growth in the cost of toll-access subscriptions, particularly via so-called “Big Deals” from publishers, has begun to force libraries and other institutions to initiate large-scale subscription cancellations; recent examples include Caltech, the University of Maryland, University of Konstanz, Université de Montréal, and the national system of Peru (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>; <a class="xref xref-bibr" href="https://doi.org/10.1038%2Fnature.2016.21223" title="Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals" data-jats-ref-type="bibr" data-jats-rid="ref-41">Schiermeier &amp; Mega, 2017</a>; <a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/" title="When the wolf finally arrives: big deal cancelations in North American Libraries" data-jats-ref-type="bibr" data-jats-rid="ref-1">Anderson, 2017a</a>; <a class="xref xref-bibr" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/" title="Teurer als die Wissenschaft erlaubt" data-jats-ref-type="bibr" data-jats-rid="ref-47">Université Konstanz, 2014</a>). 
As the toll-access status quo becomes increasingly unaffordable, institutions are looking to OA as part of their “Plan B” to maintain access to essential literature (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>).</p>
+ <p id="p-2">Open access is thus provoking a new surge of investment, controversy, and relevance across a wide group of stakeholders. We may be approaching a moment of great importance in the development of OA, and indeed of the scholarly communication system. However, despite the recent flurry of development and conversation around OA, there is a need for large-scale, high-quality data on the growth and composition of the OA literature itself. In particular, there is a need for a data-driven “state of OA” overview that is (a) large-scale, (b) up-to-date, and (c) reproducible. This paper attempts to provide such an overview, using a new open web service called oaDOI that finds links to legally-available OA scholarly articles.<a class="xref xref-fn" href="#fn-1" data-jats-ref-type="fn" data-jats-rid="fn-1"><sup>1</sup></a> Building on data provided by the oaDOI service, we answer the following questions:</p>
+ <ol class="list" id="list-1" data-jats-list-type="order">
+ <li class="list-item">
+<p id="p-4">What percentage of the scholarly literature is OA, and how does this percentage vary according to publisher, discipline, and publication year?</p>
+ </li>
+ <li class="list-item">
+<p id="p-5">Are OA papers more highly-cited than their toll-access counterparts?</p>
+ </li>
+ </ol>
+ <p id="p-6">The next section provides a brief review of the background literature for this paper, followed by a description of the datasets and methods used, as well as details on the definition and accuracy of the oaDOI categorization. Results are then presented, in turn, for each research question, and are followed by a general discussion and conclusions.</p>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Literature Review</h2>
+ <p id="p-7">Fifteen years of OA research have produced a significant body of literature, a complete review of which falls outside the scope of this paper (for recent, in-depth reviews, see <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.8460.3" title="The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)" data-jats-ref-type="bibr" data-jats-rid="ref-46">Tennant et al. (2016)</a> and <a class="xref xref-bibr" href="https://doi.org/10.7554%2FeLife.16800" title="How open science helps researchers succeed" data-jats-ref-type="bibr" data-jats-rid="ref-36">McKiernan et al. (2016)</a>). Here we instead briefly review three major topics from the OA literature: defining OA and its subtypes, assessing the prevalence of OA, and examining the relative citation impact of OA.</p>
+ <p id="p-8">Despite the large literature on OA, the term itself remains “somewhat fluid” (Antelman, 2004), making an authoritative definition challenging. The most influential definition of OA comes from the 2002 Budapest Open Access Initiative (BOAI), and defines OA as making content both <i>free to read</i> and <i>free to reuse</i>, requiring the opportunity of OA users to “crawl (articles) for indexing, pass them as data to software, or use them for any other lawful purpose.” In practice, the BOAI definition is roughly equivalent to the popular “CC-BY” Creative Commons license (<a class="xref xref-bibr" href="https://creativecommons.org/licenses/by/4.0/" title="Attribution 4.0 International (CC BY 4.0)" data-jats-ref-type="bibr" data-jats-rid="ref-19">Creative Commons, 2018</a>). However, a number of other sources prefer a less strict definition, requiring only that OA “makes the research literature free to read online” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003" title="The nine flavours of open access scholarly publishing" data-jats-ref-type="bibr" data-jats-rid="ref-51">Willinsky, 2003</a>), or that it is “digital, online, [and] free of charge.” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009" title="Status of open access in the biomedical field in 2005" data-jats-ref-type="bibr" data-jats-rid="ref-34">Matsubayashi et al., 2009</a>). 
Others have suggested it is more valuable to think of OA as a spectrum (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2016.1182672" title="Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool" data-jats-ref-type="bibr" data-jats-rid="ref-17">Chen &amp; Olijhoek, 2016</a>).</p>
+ <p id="p-9">Researchers have identified a number of subtypes of OA; some of these have near-universal support, while others remain quite controversial. We will not attempt a comprehensive list of these, but instead note several that have particular relevance for the current study.</p>
+ <ul class="list" id="list-2" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-10">Libre OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): extends user’s rights to read and also to reuse literature for purposes like automated crawling, archiving, or other purposes. The Libre OA definition is quite similar to the BOAI definition of OA.</p>
+ </li>
+ <li class="list-item">
+<p id="p-11">Gratis OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): in contrast to Libre, Gratis extends <i>only</i> rights to read articles.</p>
+ </li>
+ <li class="list-item">
+<p id="p-12">Gold OA: articles are published in an “OA journal,” a journal in which all articles are open directly on the journal website. In practice, OA journals are most often defined by their inclusion in the Directory of Open Access Journals (DOAJ) (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>; <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al., 2012</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-13">Green OA: Green articles are published in a toll-access journal, but self-archived in an OA archive. These “OA archives” are either disciplinary repositories like ArXiv, or “institutional repositories” (IRs) operated by universities, and the archived articles may be either the published versions, or electronic preprints (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>). Most Green OA articles do not meet the BOAI definition of OA since they do not extend reuse rights (making them Gratis OA).</p>
+ </li>
+ <li class="list-item">
+<p id="p-14">Hybrid OA: articles are published in a subscription journal but are immediately free to read under an open license, in exchange for an article processing charge (APC) paid by authors (<a class="xref xref-bibr" href="https://doi.org/10.1241%2Fjohokanri.41.678" title="Free internet access to traditional journals" data-jats-ref-type="bibr" data-jats-rid="ref-50">Walker &amp; Soichi, 1998</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-15">Delayed OA: articles are published in a subscription journal, but are made free to read after an embargo period (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=Willinsky&amp;publication_year=2009" title="The access principle: the case for open access to research and scholarship" data-jats-ref-type="bibr" data-jats-rid="ref-52">Willinsky, 2009</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-16">Academic Social Networks (ASN): Articles are shared by authors using commercial online social networks like ResearchGate and Academia.edu. While some include these in definitions of OA (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>), others argue that content shared on ASNs is not OA at all. Unlike Green OA repositories, ASNs do not check for copyright compliance, and therefore as much as half their content is illegally posted and hosted (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-017-2291-4" title="Copyright compliance and infringement in ResearchGate full-text journal articles" data-jats-ref-type="bibr" data-jats-rid="ref-30">Jamali, 2017</a>). This raises concerns over the persistence of content, since, as was the case in October 2017, publishers can and do issue large-scale takedown notices to ASN ordering the removal of infringing content (<a class="xref xref-bibr" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement" title="Publishers take ResearchGate to court, alleging massive copyright infringement" data-jats-ref-type="bibr" data-jats-rid="ref-15">Chawla, 2017</a>). 
Others have raised questions about the sustainability and ethics of ASN services themselves (<a class="xref xref-bibr" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html" title="A social networking site is not an open access repository" data-jats-ref-type="bibr" data-jats-rid="ref-22">Fortney &amp; Gonder, 2015</a>). Due to these concerns, and inconsistent support from the literature, we exclude ASN-hosted content from our definition of OA.<a class="xref xref-fn" href="#fn-2" data-jats-ref-type="fn" data-jats-rid="fn-2"><sup>2</sup></a> </p>
+ </li>
+ <li class="list-item">
+<p id="p-18">“Black OA”: Articles shared on illegal pirate sites, primarily Sci-Hub and LibGen. Although (<a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1096" title="Gold, green, and black open access" data-jats-ref-type="bibr" data-jats-rid="ref-10">Björk, 2017</a>) labels these articles as a subtype of OA, the literature has nearly no support for including Sci-Hub articles in definitions of OA. Given this, we exclude Sci-Hub and LibGen content from our definition of OA.</p>
+ </li>
+ </ul>
+ <p id="p-19">Based on the consensus (and in some cases, lack of consensus) around these definitions and subtypes, we will use the following definition of OA in the remainder of this paper: <b>OA articles are free to read online, either on the publisher website or in an OA repository.</b></p>
+ <section class="sec">
+ <h3 class="heading">Prevalence of OA</h3>
+ <p id="p-20">Many studies have estimated what proportion of the literature is available OA, including <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0011273" title="Open access to the scientific journal literature: situation 2009" data-jats-ref-type="bibr" data-jats-rid="ref-12">Björk et al. (2010)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1186%2F1741-7015-10-124" title="Anatomy of open access publishing: a study of longitudinal development and internal structure" data-jats-ref-type="bibr" data-jats-rid="ref-31">Laakso &amp; Björk (2012)</a>, <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al. (2012)</a>, <a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al. (2013)</a>, <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> and <a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen (2013)</a>. We are not aware of any studies since 2014. 
The most recent two analyses estimate that more than 50% of papers are now freely available online, when one includes both OA and ASNs. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a>, the most comprehensive study to date, estimates that of papers published between 2011 and 2013, 12% of articles could be retrieved from the journal website, 6% from repositories, and 31% by other mechanisms (including ASNs). <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> also found that the availability of papers published between 1996 and 2011 increased by 4% between April 2013 and April 2014, noting that “backfilling” is a significant contributor to green OA. Their discipline-level analysis confirmed the findings of other studies, that the proportion of OA is relatively high in biomedical research and math, while notably low in engineering, chemistry, and the humanities.</p>
+ <p id="p-21">This <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> study is of particular interest because it used automated web scraping to find and identify OA content; most earlier efforts have relied on laborious manual checking of the DOAJ, publisher webpages, Google, and/or Google Scholar (though see <a class="xref xref-bibr" href="http://arxiv.org/abs/cs/0606079" title="Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact" data-jats-ref-type="bibr" data-jats-rid="ref-27">Hajjem, Harnad &amp; Gingras (2006)</a> for a notable early exception). By using automated methods, Archambault et al. were able to sample hundreds of thousands of articles, greatly improving statistical power and supporting more nuanced inferences. Moreover, by creating a system that indexes OA content, they address a major concern in the world of OA research; as <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a> observes: “A major challenge for research...has been the lack of comprehensive indexing for both OA journals and their articles.†The automated system of <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. 
(2014)</a> is very accurate—it only misclassifies a paper as OA 1% of the time, and finds about 75% of all OA papers that exist online, as per <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. However, the algorithm is not able to distinguish Gold from Hybrid OA. More problematically for researchers, the database used in the study is not open online for use in follow-up research. Instead, the data has since been used to build the commercial subscription-access database 1science (<a class="ext-link" href="http://www.1science.com/oanumbr.html" data-jats-ext-link-type="uri">http://www.1science.com/oanumbr.html</a>).</p>
+ </section>
+ <section class="sec">
+ <h3 class="heading">The open access citation advantage</h3>
+ <p id="p-22">Several dozen studies have compared the citation counts of OA articles and toll-access articles. Most of these have reported higher citation counts for OA, suggesting a so-called “open access citation advantage†(OACA); several annotated bibliographies have been created to track this literature (<a class="xref xref-bibr" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/" title="The open access citation advantage: list of studies until 2015" data-jats-ref-type="bibr" data-jats-rid="ref-43">SPARC Europe, 2015</a>; <a class="xref xref-bibr" href="https://doi.org/10.5062%2FF4Q81B0W" title="Open access citation advantage: an annotated bibliography" data-jats-ref-type="bibr" data-jats-rid="ref-49">Wagner, 2010</a>; <a class="xref xref-bibr" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D" title="The open access citation advantage" data-jats-ref-type="bibr" data-jats-rid="ref-45">Tennant, 2017</a>). The OACA is not universally supported. Many studies supporting the OACA have been criticised on methodological grounds (<a class="xref xref-bibr" href="https://doi.org/10.3163%2F1536-5050.99.3.008" title="The impact of free access to the scientific literature: a review of recent research" data-jats-ref-type="bibr" data-jats-rid="ref-21">Davis &amp; Walters, 2011</a>), and an investigation using the randomized-control trial method failed to find evidence of an OACA (<a class="xref xref-bibr" href="https://doi.org/10.1096%2Ffj.11-183988" title="Open access, readership, citations: a randomized controlled trial of scientific journal publishing" data-jats-ref-type="bibr" data-jats-rid="ref-20">Davis, 2011</a>). However, recent investigations using robust methods have continued to observe an OACA. 
For instance, <a class="xref xref-bibr" href="https://doi.org/10.1111%2Fecin.12064" title="Identifying the effect of open access on citations using a panel of science journals" data-jats-ref-type="bibr" data-jats-rid="ref-35">McCabe &amp; Snyder (2014)</a> used a complex statistical model to remove confounding effects of author selection (authors may selectively publish their higher-impact work as OA), reporting a small but meaningful 8% OACA. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> describe a 40% OACA in a massive sample of over one million articles using field-normalized citation rates. <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0159614" title="The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)" data-jats-ref-type="bibr" data-jats-rid="ref-38">Ottaviani (2016)</a> used a natural experiment as articles (not selected by authors) emerged from embargoes to become OA, and reports a 19% OACA excluding the author self-selection bias for older articles outside their prime citation years.</p>
+ </section>
+ </section>
+ <section class="sec" id="methods">
+ <h2 class="heading">Methods</h2>
+ <section class="sec">
+ <h3 class="heading">OA determination</h3>
+ <section class="sec">
+ <h4 class="heading">Classifications</h4>
+ <p id="p-23">We classify publications into two categories, OA and Closed. As described above, we define OA as <i>free to read online, either on the publisher website or in an OA repository</i>; all articles not meeting this definition were defined as Closed. We further divide the OA literature into one of four exclusive subcategories, resulting in a five-category classification system for articles:</p>
+ <ul class="list" id="list-3" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-24"><b>Gold</b>: Published in an open-access journal that is indexed by the DOAJ.</p>
+ </li>
+ <li class="list-item">
+<p id="p-25"><b>Green</b>: Toll-access on the publisher page, but there is a free copy in an OA repository.</p>
+ </li>
+ <li class="list-item">
+<p id="p-26"><b>Hybrid</b>: Free under an open license in a toll-access journal.</p>
+ </li>
+ <li class="list-item">
+<p id="p-27"><b>Bronze</b>: Free to read on the publisher page, but without a clearly identifiable license.</p>
+ </li>
+ <li class="list-item">
+<p id="p-28"><b>Closed</b>: All other articles, including those shared only on an ASN or in Sci-Hub.</p>
+ </li>
+ </ul>
+ <p id="p-29">These categories are largely consistent with their use throughout the OA literature, although a few clarifications are useful. First, we (like many other OA studies) do not include ASN-hosted content as OA. Second, categories are exclusive, and publisher-hosted content takes precedence over self-archived content. This means that if an article is posted in both a Gold journal and an OA repository, we would classify it as Gold, not Green. Put another way, publisher-hosted content can “shadow” archived articles that would otherwise be Green. This definition of Green (“available in a repository but <i>not</i> available from the publisher”) is often used in the OA literature (including by Steven Harnad, the coiner of the Green and Gold terms <a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>), but this usage is not unanimous. Some studies allow a given article to be <i>both</i> Gold and Green; compared to these, our classification system does undercount Green. Hybrid articles share properties with Gold articles (both are free to read and are licensed for re-use), but differ in the venue of publication (i.e., Hybrid articles are published in journals not considered open access by the DOAJ) and in that Hybrid articles are not necessarily immediately available (i.e., they may only be freely available after an embargo). We also add a novel subcategory, Bronze. Bronze shares attributes of Gold and Hybrid; like both, Bronze OA articles are publisher-hosted. Unlike Gold OA, Bronze articles are not published in journals considered open access in the DOAJ. Unlike Hybrid, Bronze articles carry no license information. 
Although this lack of identifiable license may not be intentional, without an identifiable license, the articles are free to read but do not allow extended reuse rights beyond reading. It is also not clear if Bronze articles are temporarily or permanently available to read for free.</p>
+ <p id="p-30">Finally, we should add that, although our categories of choice reflect the OA literature, they do not necessarily reflect the more complex reality of scholarly publishing today. Organizations like SciELO and Redalyc in Latin America have been acting simultaneously as publishers and repositories and many of the articles found on their site do not fall neatly into the above categories (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010" title="The SciELO open access: a gold way from the south" data-jats-ref-type="bibr" data-jats-rid="ref-39">Packer, 2010</a>).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">The oaDOI system</h4>
+ <p id="p-31">We assigned the categories above by calling the oaDOI service with a DOI for each item. The oaDOI returns a link to a legally-available OA version of the article, when one is available (<a class="ext-link" href="https://oadoi.org/" data-jats-ext-link-type="uri">https://oadoi.org/</a>). It contains records for all 88 million Crossref DOIs.<a class="xref xref-fn" href="#fn-3" data-jats-ref-type="fn" data-jats-rid="fn-3"><sup>3</sup></a> The oaDOI service crawls, aggregates, normalizes, and verifies data from many sources including PMC (<a class="ext-link" href="https://www.ncbi.nlm.nih.gov/pmc/" data-jats-ext-link-type="uri">https://www.ncbi.nlm.nih.gov/pmc/</a>), BASE (<a class="ext-link" href="https://www.base-search.net/about/en/" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/</a>), DOAJ (<a class="ext-link" href="https://doaj.org/" data-jats-ext-link-type="uri">https://doaj.org/</a>), and thousands of institutional repositories and publishers. The oaDOI system offers a fast, free API with no rate-limits, allowing it to support a variety of other services and tools. At the time of writing, oaDOI processes approximately 500,000 requests daily–roughly twice the daily uses of Sci-Hub<a class="xref xref-fn" href="#fn-4" data-jats-ref-type="fn" data-jats-rid="fn-4"><sup>4</sup></a> (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1" title="Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)" data-jats-ref-type="bibr" data-jats-rid="ref-29">Himmelstein et al., 2017</a>). 
The majority of this volume comes from around 700 academic libraries, who use oaDOI to help readers find articles where the library has no subscription access, addressing the discoverability problem (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen, 2013</a>). The oaDOI service also powers the Unpaywall browser extension, which helps readers to find legal OA copies of paywalled articles as they browse; Unpaywall currently has over 80,000 active users. The oaDOI codebase is open source, and the service is free and open via an open API.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Accuracy of oaDOI</h4>
+ <p id="p-34">To assess the accuracy of our automated OA determination, a random subsample of 500 articles was chosen from our main “Crossref-DOI” sample, described below. We manually searched the internet for each article in our subsample to determine if the paper was freely available on the publisher’s website, or on another website, such as an institutional repository, an academic social networking site, or on a personal webpage. DOIs were resolved by appending the DOI to “<a class="ext-link" href="https://doi.org/" data-jats-ext-link-type="uri">https://doi.org/</a>”. If the full text was available through that link, articles were marked as being freely available from the publisher’s site. If articles required a subscription, the title of the article was entered into Google Scholar (GS) and into Google to find alternative versions (i.e., preprints or archived copies). If the full text was found on any publisher page or OA repository, these were marked as being freely available from an archive. If the only available open copy was hosted on an academic social network (like Academia.edu or ResearchGate), this was noted but for the sake of the study these were <i>not</i> counted as any category of OA, and were instead added to the “Closed” category.</p>
+ <p id="p-35">The performance of oaDOI is summarized below, compared to these manual accuracy checks. The complete dataset behind this summary is available in supplementary information. Using this data we calculated the recall and precision of the system. “Recall” asks the question, “when an article is open, how often does oaDOI correctly identify it as open?” The recall of the service is 77.0%, meaning that 77% of the truly open articles are correctly identified as open by oaDOI. “Precision” asks the question, “When oaDOI says an article is open, how often is it correct?” The precision of the system is 96.6%, meaning that 96.6% of the time that oaDOI reports an article is open, it really is open.</p>
+ <p id="p-36">These results can be roughly compared to the recall of 86.4% and precision of 99.1% reported by <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> for their automated system. Their accuracy estimate was also calculated based on a sample of 500 data points, giving each estimate a margin of error of ±4.5 percentage points. The Archambault study used a narrower date window for their sample (starting in 1996, versus our Crossref-DOI sample which was not time restricted), resulting in a more homogeneous task, which may partially explain their somewhat better performance.</p>
+ <p id="p-37">The oaDOI service is optimized for high precision, rather than high recall. The very high precision of oaDOI means that any estimates derived from the database can be considered a <i>conservative</i> estimate of the actual percentage of open access in the literature. That is, we can safely assume that when oaDOI reports a certain percentage of open access, the real percentage is <i>at least</i> that high—and almost certainly higher given that recall was less than perfect. Put another way, oaDOI delivers very few false positives (where it mistakenly calls an article open), but a relatively high number of false negatives (where it mistakenly calls an article closed) (<a class="xref xref-table" href="#table-1" data-jats-ref-type="table" data-jats-rid="table-1">Table 1</a>). Future improvements to the system are planned that will improve recall while keeping precision high.</p>
+ <figure class="table-wrap" id="table-1"><div class="caption">
+<span class="caption-label">Table 1: </span>
+ <div class="title">Accuracy of the prototype version of the oaDOI service used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th></th>
+ <th>oaDOI reports Open</th>
+ <th>oaDOI reports Closed</th>
+ <th>Manual count Total (ground truth)</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Open</td>
+ <td>144</td>
+ <td>43</td>
+ <td>187</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>5</td>
+ <td>308</td>
+ <td>313</td>
+ </tr>
+ <tr>
+ <td>Total</td>
+ <td>149</td>
+ <td>351</td>
+ <td style="text-align:left;">500</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-1</a>
+</div>
+ </figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">Study samples</h3>
+ <p id="p-38">Three samples of DOI-assigned scholarly resources are summarized in <a class="xref xref-table" href="#table-2" data-jats-ref-type="table" data-jats-rid="table-2">Table 2</a> and described further below.</p>
+ <section class="sec">
+ <h4 class="heading">Crossref sample</h4>
+ <p id="p-39">The first sample, “Crossref-DOIs,” is a random sample of 100,000 journal articles with Crossref DOIs, across all publication years. There are approximately 88 million Crossref DOIs in total as of May 2017. In order to exclude books, datasets, and other non-article content, we sampled only items whose “type” was listed as “journal-article” in the Crossref API metadata; there are 66 million of these. To verify the accuracy of Crossref metadata, we manually checked 150 items assigned to type “journal-article,” and determined that 93% were indeed journal articles; the remaining 7% were mostly journal front-matter such as tables of content or instructions to authors.</p>
+ <figure class="table-wrap" id="table-2"><div class="caption">
+<span class="caption-label">Table 2: </span>
+ <div class="title">Summary of samples used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover table-text" data-jats-content-type="text">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Sample name</th>
+ <th>Sample size</th>
+ <th>Population sampled</th>
+ <th>Purpose</th>
+ <th>Population size</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Crossref-DOIs</td>
+ <td>100,000</td>
+ <td>All journal articles with Crossref DOIs, all years.</td>
+ <td>Estimate percentage of the literature that is OA.</td>
+ <td>66,560,153</td>
+ </tr>
+ <tr>
+ <td>WoS-DOIs</td>
+ <td>100,000</td>
+ <td>All citable WoS articles with DOIs, 2009–2015.</td>
+ <td>Estimate citation impact of recent OA papers, and also OA prevalence by discipline.</td>
+ <td>8,083,613</td>
+ </tr>
+ <tr>
+ <td>Unpaywall-DOIs</td>
+ <td>100,000</td>
+ <td>All articles accessed by Unpaywall users over a 1-week period in 2017.</td>
+ <td>Estimate percentage of OA experienced by users of the Unpaywall extension.</td>
+ <td>213,323</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-2</a>
+</div>
+ </figure>
+ <p id="p-40">The purpose of this sample is to roughly proxy the scholarly literature as a whole. As such, it has strengths and weaknesses. One weakness is that although Crossref includes information on citation counts and discipline categorization, we found these to be quite incomplete, and therefore not useful for the present study. Another is that researchers in the scientometrics and OA fields have largely relied on other indexes, particularly Scopus and Web of Science (WoS), to represent the literature as a whole; this makes our results more difficult to compare to previous work. Finally, DOIs are known to be less frequently assigned by publishers in certain disciplines (like humanities; <a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2015.11.008" title="Availability of digital object identifiers (DOIs) in web of science and scopus" data-jats-ref-type="bibr" data-jats-rid="ref-25">Gorraiz et al., 2016</a>), in certain geographic regions (particularly the developing world), and among older articles (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-016-2225-6" title="Availability of digital object identifiers in publications archived by PubMed" data-jats-ref-type="bibr" data-jats-rid="ref-14">Boudry &amp; Chartron, 2017</a>); consequently, these segments will be underrepresented in our sample. This said, Scopus and WoS are also known to underrepresent important segments of the literature (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-015-1765-5" title="The journal coverage of Web of Science and Scopus: a comparative analysis" data-jats-ref-type="bibr" data-jats-rid="ref-37">Mongeon &amp; Paul-Hus, 2016</a>), and so this failing is not limited to Crossref. Moreover, the Crossref sample has important advantages of its own over other indexes. 
While no sample of the scholarly literature will be complete in every regard, the Crossref index is more expansive than other sources: in July 2017 there were 67 million journal articles indexed in Crossref compared to 30 million in Scopus (<a class="ext-link" href="https://www.elsevier.com/solutions/scopus/content" data-jats-ext-link-type="uri">https://www.elsevier.com/solutions/scopus/content</a>). Also, Crossref has the advantage of being entirely free and open to use, while Scopus and WoS are subscription-access databases; this allows the study data to also be free and open, promoting replication and reuse of our results in further research. However, we did turn to the subscription-access WoS in order to answer questions about the discipline and citation counts of OA articles, since Crossref data is lacking in these areas.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">WoS sample</h4>
+ <p id="p-41">The second sample, “WoS-DOIs”, is a random sample of 100,000 journal articles with DOIs that are indexed by Web of Science. The sample was drawn from a local version of the WoS database at the Observatoire des sciences et des technologies (OST) at the Université du Québec à Montréal. Only articles that WoS defines as “citable items” are included in the sample; this excludes non-peer reviewed content such as editorial material and news items. This sample is restricted to articles published between 2009 and 2015, due to DOI availability constraints. The sample of 100,000 articles is randomly drawn from a population of 8 million articles and reviews with a DOI in WoS published between 2009 and 2015 as of May 2017.</p>
+ <p id="p-42">Because the WoS sample is restricted to certain publication years, due to availability of DOIs in the WoS database, this sample is unsuitable for estimating the proportion of the total literature that is OA. However, it is more useful than the Crossref sample in some ways: the WoS sample included accurate discipline information for each article (described below), and also citation counts. Therefore we use the WoS sample to assess OA prevalence by discipline and also the citation impact of recent OA papers. We do not encourage comparisons between the OA percentages in the WoS sample and the Crossref sample, because of large differences in the sampling frames.</p>
+ <p id="p-43">Documents in the WoS-DOIs sample were classified using the National Science Foundation (NSF) journal classification system. This system assigns every journal exactly one “discipline” (a high-level categorization) and exactly one “specialty” (a finer-grained categorization). Because this is a journal-level classification, all articles from a given journal are assigned the same discipline and specialty as the journal. A downside of this approach is that the system classifies multidisciplinary journals (e.g., Nature, PNAS, PLOS ONE) as “biomedical research”, despite their publishing many articles from other fields.<a class="xref xref-fn" href="#fn-5" data-jats-ref-type="fn" data-jats-rid="fn-5"><sup>5</sup></a> In these cases, we used a ground-up, article-by-article classification approach. Each article published in a list of multidisciplinary journals was assigned to the NSF specialty which appeared most frequently in its own reference list. In other words, papers published in multidisciplinary journals were classified at the article level (instead of at the journal level) to the subject area which they cite most frequently.<a class="xref xref-fn" href="#fn-6" data-jats-ref-type="fn" data-jats-rid="fn-6"><sup>6</sup></a> </p>
+ <p id="p-46">We assess the relative impact of open and closed articles, using citations as an indicator of their scholarly impact. There are several properties of articles, however, that can confound this kind of comparison. Chief among these are the article’s discipline (some fields are much more cited than others) and its age (older articles have had more time to gather citations). In order to address this, we computed a normalized expected number of citations for each article, based on its age and its NSF specialty, by comparing it to the average citations for similar articles.<a class="xref xref-fn" href="#fn-7" data-jats-ref-type="fn" data-jats-rid="fn-7"><sup>7</sup></a> </p>
+ <p id="p-48">Using this approach, each article receives an average relative citation (ARC). An ARC of 1.0 indicates that a document was cited according to expectations based on documents published in the same year and NSF specialty, while an ARC above or below 1.0 indicates that the citation impact was above or below world average, respectively. Using these field-normalized citation rates, citation impact can be compared across scientific disciplines as well as across years. We can also compute mean ARCs for groups of articles, like “all open articles” or “all closed articles”, allowing us to compare normalized impact between these two groups. Analyzing results on the level of NSF disciplines, data is not shown for the Humanities (<i>n</i> = 1,091) and Arts (<i>n</i> = 164), because they are underrepresented both in the Web of Science and in terms of DOI coverage.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Unpaywall sample</h4>
+ <p id="p-49">The third sample, “Unpaywall-DOIs”, is a random sample of 100,000 articles accessed by users of the free, open-source Unpaywall browser extension, gathered over a one-week time window. We collected IP addresses and DOI requests made to the oaDOI service through the Unpaywall browser extension during the week of June 5–June 11, 2017. In that time period there were 374,703 total accesses, 213,323 unique DOIs, and 42,894 unique IP addresses gathered in total, from which 100,000 unique DOIs were randomly sampled.</p>
+ <p id="p-50">This sample was used to assess the prevalence of OA experienced by users of the Unpaywall extension (since Unpaywall uses oaDOI data to find OA). It is a convenience sample of what articles people are interested in reading, and thereby lets us roughly estimate the percent of this literature that is OA. The sample has serious limitations, however: we don’t know the demographics of Unpaywall users, and we are aware of a bias towards users from the US (as determined by the IP addresses). As such, we cannot accurately generalize the results by education level, discipline, or purpose in reading the scholarly literature.</p>
+ </section>
+ </section>
+ </section>
+ <section class="sec" id="results">
+ <h2 class="heading">Results</h2>
+ <section class="sec">
+ <h3 class="heading">RQ1. What percent of the literature is open access?</h3>
+ <section class="sec">
+ <h4 class="heading">How much of the literature is OA?</h4>
+ <p id="p-51">We found 27.9% (95% CI [27.6–28.2]) of all DOI-assigned journal articles are OA, using the Crossref-DOI sample. Based on this, we estimate there are 18.6 million OA articles with Crossref DOIs (95% CI [18.4–18.8]). This is the total population of OA articles that can be identified and accessed by oaDOI. Given our finding (described in Methods above) that the oaDOI service finds 77% of OA compared to manual searches, we can further estimate that an additional 3.5 million articles are OA but not detectable by this version of oaDOI.</p>
+ <p id="p-52">People reading the literature using the Unpaywall browser extension encounter a significantly higher proportion of OA: we found that 47.0% (95% CI [46.7–47.3]) of the Unpaywall-accessed sample is open access. The main reason for this is article age: since this sample is based on the behavior of actual readers, it is disproportionately comprised of recent articles. In fact, half the accessed articles were published in the last 2 years. Recent articles are much more likely to be OA than their older counterparts (see Results ‘How does Open Access vary by year of publication?’ below).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">What types of Open Access are most common?</h4>
+ <p id="p-53">The proportion of OA by subtype is relatively similar across the samples, as shown in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a> and <a class="xref xref-table" href="#table-3" data-jats-ref-type="table" data-jats-rid="table-3">Table 3</a>. Green OA represents a relatively small percentage of OA articles in all three samples. This is partly because self-archived articles are only counted as Green where there is no publisher-hosted option available; that is, Green OA is sometimes “shadowed” by Gold, Bronze, or Hybrid articles. Bronze is the most common OA subtype in all the samples, which is particularly interesting given that few studies have highlighted its role. We manually inspected a small sample of Bronze articles in order to understand this subcategory more; we found that while many Bronze articles were Delayed OA from toll-access publishers, nearly half were hosted on journals that published 100% of content as free-to-read but were <i>not</i> listed on the DOAJ and did not formally license content (using CC-BY or any other license). Such journals might be better described as “Dark Gold” or “Hidden Gold” than Bronze. A more complete examination of Bronze falls outside the scope of this study, and therefore further investigation will be undertaken in future work.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-1"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 1: Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-small.jpg 355w" data-image-id="fig-1" alt="Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="230"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 1: </span>Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" class="btn btn-mini" download="peerj-4375-fig-1.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-1</a>
+</div>
+</div></figcaption></figure>
+ <figure class="table-wrap" id="table-3"><div class="caption">
+<span class="caption-label">Table 3: </span>
+ <div class="title">Percent of the literature that is OA, by type, in three samples of 100,000 journal articles, with 95% confidence intervals.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Access type</th>
+ <th style="text-align:center;" colspan="2">Crossref-DOI All journal articles with Crossref DOIs, all years. (“Articles with DOIs” in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a>)</th>
+ <th style="text-align:center;" colspan="2">WoS-DOIs All citable WoS articles with DOIs, 2009–2015</th>
+ <th style="text-align:center;" colspan="2">Unpaywall-DOIs All articles accessed by Unpaywall users over a 1-week period in 2017</th>
+ </tr>
+ <tr>
+ <th></th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>OA (all types)</td>
+ <td>27.9%</td>
+ <td>27.6–28.2</td>
+ <td>36.1%</td>
+ <td>36.0–36.2</td>
+ <td>47.0%</td>
+ <td>46.7–47.3</td>
+ </tr>
+ <tr>
+ <td>Bronze OA</td>
+ <td>16.2%</td>
+ <td>16.0–16.5</td>
+ <td>12.9%</td>
+ <td>12.6–13.2</td>
+ <td>15.3%</td>
+ <td>15.0–15.6</td>
+ </tr>
+ <tr>
+ <td>Hybrid OA</td>
+ <td>3.6%</td>
+ <td>3.3–3.9</td>
+ <td>4.3%</td>
+ <td>4.0–4.6</td>
+ <td>8.3%</td>
+ <td>8.0–8.6</td>
+ </tr>
+ <tr>
+ <td>Gold OA</td>
+ <td>3.2%</td>
+ <td>2.9–3.5</td>
+ <td>7.4%</td>
+ <td>7.1–7.7</td>
+ <td>14.3%</td>
+ <td>14.0–14.6</td>
+ </tr>
+ <tr>
+ <td>Green OA</td>
+ <td>4.8%</td>
+ <td>4.5–5.1</td>
+ <td>11.5%</td>
+ <td>11.2–11.8</td>
+ <td>9.1%</td>
+ <td>8.8–9.4</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>72.0%</td>
+ <td>71.8–72.4</td>
+ <td>63.9%</td>
+ <td>63.8–64.0</td>
+ <td>53.0%</td>
+ <td>52.7–53.3</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-3</a>
+</div>
+ </figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by year of publication?</h4>
+ <p id="p-54"><a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Figure 2</a> presents the number (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2A</a>) and proportion (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2B</a>) of papers by access category and publication date. Articles published in the last 20 years are increasingly OA, and this trend shows no sign of slowing. More recent articles are more likely to be OA, with the most recent year examined also containing the most OA: 44.7% of 2015 articles are OA (95% CI [43.3–46.2%]), including 17.6% Bronze (95% CI [16.2–19.1]), 9.4% Hybrid (95% CI [8.0–10.9]), 11.3% Gold (95% CI [9.9–12.8]), and 6.3% Green (95% CI [4.9–7.8]). Well over one million OA papers were published in 2015. This growth trend has largely been driven by dramatic growth in Gold and Hybrid OA since the year 2000. However, more than 20% of papers published before the digital age are also freely available. The majority of these older OA papers are Bronze, and based on their age they are probably more precisely Delayed OA, although additional investigation will be required to confirm this. Bronze OA remains remarkably constant as a proportion of the literature for all publication years examined.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-2"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 2: Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-small.jpg 355w" data-image-id="fig-2" alt="Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="216"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 2: </span>Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" class="btn btn-mini" download="peerj-4375-fig-2.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-2</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-55">The number and proportion of Green papers must be interpreted with particular caution, due to several factors. First, unlike publisher-hosted OA (Gold, Bronze, and Hybrid), the date when the Green article <i>became open</i> is generally different from the date the article was <i>first published</i>. Authors often self-archive articles years after (or before, in the case of preprints) their original publication, leading to so-called “backfilling” of Green stocks (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>). Consequently, the graph cannot show the growth of Green OA over time; this would require longitudinal analysis over several years, and so is outside the scope of this analysis. Instead it shows the number and proportion of Green OA by publication year of the article. Second, many articles cannot be legally self-archived until a certain number of months after publication; this embargoing likely influences the apparent plateau in Green shown in <a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2</a>. Finally, as noted earlier, many self-archived articles would otherwise be Green except for being “shadowed” by a Gold, Bronze, or Hybrid of the same article elsewhere. For more detail on the growth of shadowed Green OA, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Figs. SA2</a> and <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">SA3</a>.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by publisher?</h4>
+ <p id="p-56">We analyzed a subset of the Crossref-DOIs sample by publisher (as listed on the Crossref metadata record) to understand how the extent and types of OA are common across publishers for recent publications (between 2009 and 2015). As we can see in <a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3A</a>, the largest publishers by volume publish the most OA articles by volume, led by Elsevier. As a proportion of all articles published (<a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3B</a>), however, PLOS and Hindawi distinguish themselves as being the only publishers in the top 20 with 100% OA. More than half of the papers published by Oxford University Press, Nature Publishing Group, IOP Publishing, and the American Physical Society (APS) are freely available online. In the case of APS this is largely driven by content available through repositories such as arXiv (for more details on repositories, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>).</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-3"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 3: Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-small.jpg 355w" data-image-id="fig-3" alt="Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="282"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 3: </span>Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" class="btn btn-mini" download="peerj-4375-fig-3.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-3</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary across disciplines?</h4>
+ <p id="p-57">We used the WoS-DOIs sample to examine OA prevalence differences by discipline, because of the easy availability of discipline metadata in the WoS index. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> displays our results. More than half of the publications are freely available in biomedical research and mathematics, while in chemistry and engineering &amp; technology less than 20% of the papers are freely available. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> also highlights the popularity of Green OA in disciplines like physics and mathematics, where more than one fifth of papers are available only through online repositories (mainly arXiv). Hybrid articles are particularly prevalent in mathematics (9.4%), biomedical research (8.1%) and clinical medicine (6.3%), while authors in biomedical research (15.3%), health (11.7%), mathematics (11.2%) and clinical medicine (10.3%) often publish in Gold journals.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-4"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 4: Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-small.jpg 355w" data-image-id="fig-4" alt="Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="241"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 4: </span>Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" class="btn btn-mini" download="peerj-4375-fig-4.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-4" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-4</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-58">Large variations can also be observed on the more detailed level of NSF specialties (<a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA5</a>). At more than 80% of OA articles, astronomy &amp; astrophysics (87%), fertility (86%), tropical medicine (84%), and embryology (83%) were the specialties where access to literature was the most open. At the other end of the spectrum are pharmacy (7%), inorganic &amp; nuclear chemistry (7%), and chemical engineering (9%), where publications were hidden behind a paywall for more than 90% of papers. More detail on these and other NSF specialties can be seen in <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>.</p>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">RQ2. What is the scholarly impact of open access?</h3>
+ <p id="p-59">Comparing the average relative citation impact of different access categories, the OACA is corroborated: Papers hidden behind a paywall were cited 10% below world average (ARC = 0.90), while those that are freely available obtain, on average, 18% more citations than what is expected (ARC = 1.18). However, citation impact differs between the different manners in which papers are made available for free: those that are only available as Green OA (ARC = 1.33) and Hybrid OA papers (ARC = 1.31) are cited the most with an impact of more than 30% above expectations, those available as Bronze are cited 22% above world average, while papers published as Gold OA obtain an ARC of 0.83. This constitutes an average relative citation impact of 17% below world average and 9% below that of articles hidden behind a paywall. <a class="xref xref-fig" href="#fig-5" data-jats-ref-type="fig" data-jats-rid="fig-5">Figure 5</a> below describes these findings.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-5"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 5: Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w" data-image-id="fig-5" alt="Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="388"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 5: </span>Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" class="btn btn-mini" download="peerj-4375-fig-5.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-5" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-5</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-60">These trends vary over time, however, as shown in <a class="xref xref-fig" href="#fig-6" data-jats-ref-type="fig" data-jats-rid="fig-6">Fig. 6</a>. While the ARC of closed access papers remains below world average throughout the period studied, it increased from .86 in 2009 to .93 in 2014 and 2015. Meanwhile, when looking across all open types, the mean citation rate is consistently above the world average, fluctuating between 1.15 and 1.22. This fluctuation is guided by differences between the access types, with the impact of Hybrid OA papers increasing over the time period. While Green OA papers’ mean citation rate remains relatively stable, the highest impact, for 2015, is obtained by Bronze and Hybrid. The only form of open for which mean impact has decreased steadily over time is Gold. The results for more recent years are only based on a short citation window, however, and results might change over the next years as citations accumulate.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-6"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 6: Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-small.jpg 355w" data-image-id="fig-6" alt="Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="465"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 6: </span>Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" class="btn btn-mini" download="peerj-4375-fig-6.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-6" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-6</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Discussion and Conclusion</h2>
+ <p id="p-61">Access to scholarly literature is at the heart of current debates in the research community. Research funders are increasingly mandating OA dissemination to their grantees while, at the same time, the growth in toll-access subscription costs has prompted more and more university libraries to cancel subscriptions. In this context, several tools have been developed to provide access–both legally and illegally–to scholarly literature. Using data from one of these tools (oaDOI), this paper addresses two broad research questions: what percent of the literature is OA and how does it vary by type of OA, and what is the mean scholarly impact of papers diffused through this form. Three large samples were used, to assess different aspects of OA patterns: (1) 100,000 articles that have a Crossref DOI, which allows us to assess the relative proportion of OA across all existing literature; (2) 100,000 WoS-indexed journal articles that have a DOI, which allows us to assess the scholarly impact of OA and non OA papers; (3) 100,000 articles accessed by users through the Unpaywall browser extension, which lets us assess the proportion of OA papers found by users of this free tool.</p>
+ <p id="p-62">We found that 28% of all journal articles are freely available online (Crossref-DOI sample). Encouragingly for proponents of OA, this proportion has been growing steadily over the last 20 years, driven particularly by growth in Gold and Hybrid. Articles from 2015, the most recent year examined, had the highest proportion OA (45%), as well as the largest absolute number of OA articles published in a single year. This disproportionate level of OA in recent years, combined with readers’ preference for more recent articles, leads to a felicitous situation for readers: the proportion of OA they <i>experience</i> as they browse and search is better than the overall percentage of OA across the literature as a whole. Users of the Unpaywall browser extension, which gives individual readers access to the oaDOI service, encounter OA articles nearly half (47%) of the time. The effect almost certainly extends beyond Unpaywall users; one may assume readers in general also favor newer articles, and therefore benefit from the growth of Gold, Bronze, and Hybrid OA among recent papers, even without using Unpaywall. More studies of readership data from other sources would be useful to quantify this further.</p>
+ <p id="p-63">Interestingly, we found that the majority of OA articles are Bronze–hosted on publisher websites, either without a license at all or without an open license. This is surprisingly high given that Bronze is relatively little-discussed in the OA literature, and suggests that this OA category deserves further attention from the OA community. In particular, Bronze OA may be significant in a policy context, since, unlike other publisher-hosted OA, Bronze articles do not extend any reuse rights beyond reading, making them Gratis OA. Much more research is needed into the characteristics of Bronze OA. How many Bronze articles are licensed openly, but do not make their license available? Is Bronze disproportionately non-peer-reviewed content? How much of Bronze OA is also Delayed OA? How much Bronze is Promotional, and how transient is the free-to-read status of this content? How many Bronze articles are published in “hidden gold” journals that are not listed in the DOAJ? Why are these journals not defining an explicit license for their content, and are there effective ways to encourage this? These and other questions are outside the scope of this study but may provide fruitful insights for future OA research and policy.</p>
+ <p id="p-64">Only about 7% of the literature overall (and 17% of the OA literature) is Green. This may at first seem disappointing, given years of advocacy focused on Green OA as well as ongoing growth in the number of Green OA mandates (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014" title="Anatomy of green open access" data-jats-ref-type="bibr" data-jats-rid="ref-11">Björk et al., 2014</a>). However, the full context of Green OA provides reasons for optimism. First, many papers are archived in repositories but are not counted as Green in this analysis because they are also available on the publisher site as Hybrid, Gold, or Bronze versions. These “shadowed Green” copies provide a useful safety net that preserves access in cases where publishers rescind it (as could potentially happen with Delayed OA and other Bronze articles). Further research is needed to determine the prevalence of shadowed Green OA in various disciplines. Second, the phenomenon of “backfilling” (authors self-archiving content published across all years, not just the current one) means that although the percentage graph of Green OA does not show the same year-over-year slope as Gold or Hybrid, the line itself may be rising across <i>all</i> years as authors gradually self-archive papers from years or even decades ago. This assumption is supported by results reported by <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. Finally, the relatively low proportion of green OA encouragingly leaves room for continued growth. While most journals published by major publishers (Elsevier, Wiley, Springer, etc.) 
allow for self-archiving, research shows that only a small proportion of papers from these publishers actually are self-archived in OA repositories; for example, <a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=" title="Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature" data-jats-ref-type="bibr" data-jats-rid="ref-42">Smith et al. (in press)</a> report using a sample of Global Health Research papers that only 39% of them made use of available self-archiving rights.</p>
+ <p id="p-65">Our results confirm the Open Access Citation Advantage found by other studies: open articles receive 18% more citations than otherwise expected. While at least some of this boost is likely due to the fact that more access allows more people to read and hence cite articles they otherwise would not, causation is difficult to establish and there are many possible confounders. Most discussed is the so-called “selection bias postulate”, (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2007.04.001" title="Do open access articles have greater citation impact?" data-jats-ref-type="bibr" data-jats-rid="ref-18">Craig et al., 2007</a>) which suggests that authors choose only their most impactful work to make OA. The current study does not examine the cause or directionality of correlation, but does find that it exists in a very large sample that is relatively representative of the literature as a whole. Funder requirements may also play a role in the observed citation advantage: high-profile funders are more likely to have an OA publishing requirement; at the same time, well funded studies are independently more likely to receive more citations than poorly funded studies (<a class="xref xref-bibr" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/" title="Measuring the scientific output and impact of NIGMS grants" data-jats-ref-type="bibr" data-jats-rid="ref-7">Berg, 2010</a>). Interestingly, Gold articles are actually cited <i>less</i>, likely due to an increase in the number of newer and smaller OA journals. 
Some of these journals are from regions of the world not historically indexed by WoS, are published in languages other than English, or might be considered to be less prestigious because they have not had time to become established or accumulate citations (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>). On the flip side, the citation disadvantage of Gold OA is likely also affected by the continued growth of so-called ‘mega journals’ such as PLOS ONE (<a class="xref xref-bibr" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication" title="Reviewer guidelines: criteria for publication" data-jats-ref-type="bibr" data-jats-rid="ref-40"> PLOS, 2018</a>). Whatever the reason, the lower impact of Gold means the overall citation advantage is strongly driven by Green, Hybrid, and Bronze content. In sum, while several factors can affect the observed differences in citation rates, and causation remains difficult to establish, the fact remains that scholars are much more likely to read and cite papers to which they have access than those that they cannot obtain. Hopefully the existence of a free, open index of OA content will help support further research into the OACA question.</p>
+ <p id="p-66">The relatively high percentage of OA found in this study, particularly among readers of the free Unpaywall extension, has important potential implications for academic libraries. Increasingly, these libraries are under pressure to meet growing prices of “Big Deal” subscription packages, and the once-unthinkable outcome of canceling these Big Deals is becoming an increasingly realistic option. In this environment, knowing that around half of the literature of interest is available without any subscription may tip the scales toward cancellation for some institutions–particularly given that this percentage seems to be growing steadily. Indeed, the Université de Montréal’s cancellation of their Taylor &amp; Francis subscription package (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>) is particularly interesting, given that their cancellation announcement directly pointed faculty to Unpaywall and other tools to help them access OA content. This may seem a radical suggestion, but cancellation of subscription journals has long been part of the universal OA roadmap (<a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/" title="The forbidden forecast: thinking about open access and library subscriptions" data-jats-ref-type="bibr" data-jats-rid="ref-2">Anderson, 2017b</a>). 
Even when the percentage of OA is not enough to support outright cancellation, it may be enough to negotiate better subscription rates by supporting calculation of “OA-adjusted Cost Per Access” (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>). However, much more study is needed to see how OA availability varies across journals and Big Deal packages, along with praxis-oriented work building OA analysis tools that help librarians make cancellation choices.</p>
+ <p id="p-67">This study has several important limitations. Our dataset only includes journal articles with DOIs, which means that disciplines and geographical areas which rely more heavily on conference papers or articles without DOIs are underrepresented. Our Crossref sample includes about 7% journal “front matter” that the journal has assigned a DOI and Crossref labelled “journal article” but is actually a page describing the journal Editorial Board or similar. Our Bronze OA category includes articles published in OA journals which aren’t indexed in DOAJ; future work must identify these OA journals and classify such articles as Gold. As discussed in our definition of OA, when finding open copies we ignored free-to-read articles from academic social networks like ResearchGate and Academia.edu. The oaDOI system has some coverage of articles published on personal web pages, but this is quite limited compared to web-scale indexes like Google. The oaDOI system includes thousands of institutional and subject repositories, but there are some repositories that it misses. Our accuracy checks suggest that oaDOI, and therefore this study, are probably overlooking around 23% of OA otherwise discoverable using web searches, meaning that estimates reported in this paper undercount OA by approximately 30%. Finally, our approach did not detect <i>when</i> articles were deposited into repositories. Because repositories are often backfilled with content that has been published many years ago, this study does not measure any increase/decrease in prevalence of Green OA over time, but only the proportion of Green OA by article publication date at the moment of data collection.</p>
+ <p id="p-68">In addition to the empirical results obtained, this paper clearly shows the potential of the oaDOI service for future research. The freely available oaDOI service provides scholars with the basis for assessing and monitoring the development of access to scholarly literature on a large scale, as well as the factors that affect it. For instance, our results show that the percentage of the literature available as OA is growing, and that articles diffused through this form are generally more cited than closed access articles. Several factors are likely to contribute to these trends; however, those remain poorly understood. Combined with other datasets–such as the WoS, Scopus, or Crossref–oaDOI allows one to assess at a large-scale the effects of various mandates on deposit rates, or to track the development of documents’ accessibility to determine, for example, when authors self-archive, or the sustainability of the promotional OA category. Aggregated at the level of journals and publishing platforms, these data can also provide librarians with indicators to help inform subscription cancellations and mitigate their effects. The application of the oaDOI algorithm on a large scale also allows for more complete analysis of the OA citation advantage across fields and time. As in <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0013636" title="Self-selected or mandated, open access increases citation impact for higher quality research" data-jats-ref-type="bibr" data-jats-rid="ref-23">Gargouri et al. (2010)</a>, confounding factors could be mitigated by using article-level metadata to identify article pairs published in the same journal issue, on the same topic or published by the same authors at the same time. We hope that other scholars will dig deeper in those data to better understand OA dissemination and the factors that drive it. This is of utmost importance for the future of scholarly communication.</p>
+ </section>
+ <section class="sec" id="supplemental-information">
+ <h2 class="heading"> Supplemental Information</h2>
+ <div class="supplementary-material well well-small" id="supp-1" data-jats-mimetype="application" data-jats-mime-subtype="vnd.openxmlformats-officedocument.wordprocessingml.document">
+<h3 class="heading">Additional results</h3>
+
+ <div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/supp-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/supp-1</a>
+</div>
+<div><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/appendix.docx" class="btn article-supporting-download" data-rel="supplement" download="appendix.docx" data-filename="appendix.docx"><i class="icon-large icon-download-alt"> </i> Download</a></div>
+</div>
+ </section>
+ </div>
+<div id="article-footnotes">
+<div class="fn article-footnote" id="fn-1"><span class="p">In the interest of full disclosure, it should be noted that two of the authors of the paper are the co-founders of Impactstory, the non-profit organization that developed oaDOI.</span></div>
+<div class="fn article-footnote" id="fn-2"><span class="p">Repositories that were included are those covered by the Bielefeld Academic Search Engine (BASE) in May 2017. A full listing of repositories can be found on their website at: <a class="ext-link" href="https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1</a>
+ </span></div>
+<div class="fn article-footnote" id="fn-3"><span class="p">DOIs are short, unique identifiers for scholarly papers. Crossref is a nonprofit that helps the DOI system, and is by far the largest supplier of academic DOIs in academia.</span></div>
+<div class="fn article-footnote" id="fn-4"><span class="p">Based on a Sci-Hub dataset released in 2016 (the most recent data available).</span></div>
+<div class="fn article-footnote" id="fn-5"><span class="p">These journals were identified by selecting journals with over one thousand articles per year from those classified in the general “biomedical research” category. The full list of journals meeting these criteria was: PLOS ONE, Nature, Science, Scientific Reports, PNAS, Nature Communications, PeerJ, and Science Advances.</span></div>
+<div class="fn article-footnote" id="fn-6"><span class="p">Ties between frequently cited specialties were resolved randomly; that is, if a paper cites exactly the same number of papers from two NSF specialties, it was assigned to one of the two at random.</span></div>
+<div class="fn article-footnote" id="fn-7"><span class="p">Citations were normalized using the population of WoS articles and reviews with a DOI.</span></div>
+</div></main><footer class="back">
+ <section class="ack" id="acknowledgements"><h2 class="heading">Acknowledgements</h2>
+ <p>The authors would like to thank Dorothea Salo, Kristin Antelman, and John Sack for extensive and valuable comments on a draft of this article. The author order of JP and HP was determined by coin flip, as is their custom.</p>
+ </section>
+ <div class="sec" id="additional-information">
+ <h2 class="heading">Additional Information and Declarations</h2>
+ <div class="fn-group" data-jats-content-type="competing-interests">
+ <h3 class="heading">Competing Interests</h3>
+<div class="fn" id="conflict-1" data-jats-fn-type="conflict"><p>Heather Piwowar and Jason Priem are founders of Impactstory, a non-profit company which makes Unpaywall, oaDOI, and other tools to improve scholarly communication.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="author-contributions">
+ <h3 class="heading">Author Contributions</h3>
+<div class="fn" id="contribution-1" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-1" data-jats-ref-type="contrib" data-jats-rid="author-1">Heather Piwowar</a>, <a class="xref xref-contrib" href="#author-2" data-jats-ref-type="contrib" data-jats-rid="author-2">Jason Priem</a> and <a class="xref xref-contrib" href="#author-9" data-jats-ref-type="contrib" data-jats-rid="author-9">Stefanie Haustein</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-2" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-3" data-jats-ref-type="contrib" data-jats-rid="author-3">Vincent Larivière</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-3" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-4" data-jats-ref-type="contrib" data-jats-rid="author-4">Juan Pablo Alperin</a> conceived and designed the experiments, performed the experiments, analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-4" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-5" data-jats-ref-type="contrib" data-jats-rid="author-5">Lisa Matthias</a> performed the experiments, analyzed the data, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-5" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-6" data-jats-ref-type="contrib" data-jats-rid="author-6">Bree Norlander</a> analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-6" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-7" data-jats-ref-type="contrib" data-jats-rid="author-7">Ashley Farley</a> wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-7" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-8" data-jats-ref-type="contrib" data-jats-rid="author-8">Jevin West</a> reviewed drafts of the paper.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="other">
+ <h3 class="heading">Data Availability</h3>
+<div class="fn" id="addinfo-1">
+<p>The following information was supplied regarding data availability:</p>
+ <p>Zenodo: <a class="ext-link" href="http://doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://doi.org/10.5281/zenodo.837902</a>.</p>
+ <p>The datasets behind the analysis in this paper are openly available at <a class="ext-link" href="http://dx.doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://dx.doi.org/10.5281/zenodo.837902</a> and the R statistics code can be found at <a class="ext-link" href="https://github.com/Impactstory/oadoi-paper1" data-jats-ext-link-type="uri">https://github.com/Impactstory/oadoi-paper1</a>. The oaDOI code is open source at <a class="ext-link" href="https://github.com/impactstory/oadoi" data-jats-ext-link-type="uri">https://github.com/impactstory/oadoi</a> and information about accessing the oaDOI API and full dataset is at <a class="ext-link" href="https://oadoi.org/api" data-jats-ext-link-type="uri">https://oadoi.org/api</a>.</p>
+</div>
+</div>
+ <h3 class="heading">Funding</h3>
+<p>The authors received no funding for this work.</p>
+</div>
+ <section class="ref-list-container" id="references"><h2 class="heading">References</h2>
+<ul class="ref-list" data-jats-content-type="authoryear">
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-1">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017a</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">When the wolf finally arrives: big deal cancelations in North American Libraries</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2018-01-09">09 January 2018</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-2">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017b</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">The forbidden forecast: thinking about open access and library subscriptions</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-07-15">15 July 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-3">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Antelman</span> <span class="given-names" itemprop="givenName">K</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf">Leveraging the growth of open access in library collection decision making</a>.</cite> In: <span itemprop="name"><a class="conf-name" target="_blank" href="https://scholar.google.com/scholar_lookup?title=Proceeding%20from%20ACRL%202017:%20at%20the%20helm:%20leading%20transformation&amp;author=&amp;publication_year=2017">Proceeding from ACRL 2017: at the helm: leading transformation</a>.</span><span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-4">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <span class="article-title"> <span class="source">Proportion of open access peer-reviewed papers at the European and world levels–2004–2011</span>. </span><span class="institution">European Commission, Brussels</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-5">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">AF</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <span class="article-title"> <span class="source">Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013</span>. </span><span class="institution">European Commission</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-6">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Côté</span> <span class="given-names" itemprop="givenName">G</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Struck</span> <span class="given-names" itemprop="givenName">B</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Voorons</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom">Research impact of paywalled versus open access papers</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-7">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Berg</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">Measuring the scientific output and impact of NIGMS grants</a>.</cite> <span> <span class="comment">NIGMS Feedback Loop Blog [Blog post]. <a class="uri" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/</a>
+ </span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-8">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2016a</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2016.08.002">Hybrid open access—a longitudinal study</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">919</span>-<span class="lpage" itemprop="pageEnd">932</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-9">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B-C</span></span>.</b> <b class="year" itemprop="datePublished">2016b</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1021">The open access movement at a crossroad: are the big publishers and academic social media taking over?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">29</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">131</span>-<span class="lpage" itemprop="pageEnd">134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-10">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1096">Gold, green, and black open access</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">30</b></span>:<span class="fpage" itemprop="pageStart">173</span>-<span class="lpage" itemprop="pageEnd">175</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-11">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paetau</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014">Anatomy of green open access</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Association for Information Science and Technology</span></span> <b itemprop="volumeNumber">65</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">237</span>-<span class="lpage" itemprop="pageEnd">250</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-12">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Majlender</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Guðnason</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0011273">Open access to the scientific journal literature: situation 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b 
itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e11273</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-13">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bohannon</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1126%2Fscience.352.6285.508">Who’s downloading pirated papers? Everyone</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science</span></span> <b itemprop="volumeNumber">352</b></span>(<span itemprop="issueNumber">6285</span>)</span>:<span class="fpage" itemprop="pageStart">508</span>-<span class="lpage" itemprop="pageEnd">512</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-14">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Boudry</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chartron</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-016-2225-6">Availability of digital object identifiers in publications archived by PubMed</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics March</span></span> <b itemprop="volumeNumber">110</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">1453</span>-<span class="lpage" itemprop="pageEnd">1469</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-15">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chawla</span> <span class="given-names" itemprop="givenName">D</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement">Publishers take ResearchGate to court, alleging massive copyright infringement</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science News</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-16">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F19322909.2013.795426">Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Web Librarianship</span></span> <b itemprop="volumeNumber">7</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">243</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-17">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Olijhoek</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2016.1182672">Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">42</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">108</span>-<span class="lpage" itemprop="pageEnd">115</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-18">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Craig</span> <span class="given-names" itemprop="givenName">ID</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Plume</span> <span class="given-names" itemprop="givenName">AM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McVeigh</span> <span class="given-names" itemprop="givenName">ME</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Pringle</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amin</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2007</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2007.04.001">Do open access articles have greater citation impact?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">1</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">239</span>-<span class="lpage" itemprop="pageEnd">248</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-19">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Creative Commons</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://creativecommons.org/licenses/by/4.0/">Attribution 4.0 International (CC BY 4.0)</a></cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-20">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1096%2Ffj.11-183988">Open access, readership, citations: a randomized controlled trial of scientific journal publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">FASEB Journal</span></span> <b itemprop="volumeNumber">25</b></span>:<span class="fpage" itemprop="pageStart">2129</span>-<span class="lpage" itemprop="pageEnd">2134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-21">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walters</span> <span class="given-names" itemprop="givenName">WH</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.3163%2F1536-5050.99.3.008">The impact of free access to the scientific literature: a review of recent research</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">99</b></span>:<span class="fpage" itemprop="pageStart">208</span>-<span class="lpage" itemprop="pageEnd">217</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-22">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fortney</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gonder</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <span class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html">A social networking site is not an open access repository</a>. <span class="source">Office of Scholarly Communication</span>. </span><span class="institution">University of California</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-23">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0013636">Self-selected or mandated, open access increases citation impact for higher quality research</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" 
itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">10</span>)</span>:<span class="fpage" itemprop="pageStart">e13636</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-24">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/1206.3664">Green and gold open access percentages and growth, by discipline</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-25">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gorraiz</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Melero-Fuentes</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gumpenbergera</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Valderrama-Zuriánc</span> <span class="given-names" itemprop="givenName">J-C</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2015.11.008">Availability of digital object identifiers (DOIs) in web of science and scopus</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">98</span>-<span class="lpage" itemprop="pageEnd">109</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-26">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greshake</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.11366.1">Looking into Pandora’s Box: the content of <i>Sci-Hub</i> and its usage [version 1; referees: 2 approved, 2 approved with reservations]</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000Research</span></span> <b itemprop="volumeNumber">6</b></span> <span class="comment">Article 541</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-27">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>.</b> <b class="year" itemprop="datePublished">2006</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/cs/0606079">Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-28">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Vallières</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hitchcock</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Oppenheim</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hilf</span> <span class="given-names" 
itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2008.10765150">The access/impact problem and the green and gold roads to open access: an update</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">34</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">36</span>-<span class="lpage" itemprop="pageEnd">40</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-29">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Himmelstein</span> <span class="given-names" itemprop="givenName">DS</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Romero</span> <span class="given-names" itemprop="givenName">AR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McLaughlin</span> <span class="given-names" itemprop="givenName">SR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tzovaras</span> <span class="given-names" itemprop="givenName">BG</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greene</span> <span class="given-names" itemprop="givenName">CS</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1">Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)</a></cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ Preprints</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-30">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jamali</span> <span class="given-names" itemprop="givenName">HR</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-017-2291-4">Copyright compliance and infringement in ResearchGate full-text journal articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">112</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">241</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-31">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1186%2F1741-7015-10-124">Anatomy of open access publishing: a study of longitudinal development and internal structure</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Medicine</span></span> <b itemprop="volumeNumber">10</b></span> <span class="comment">Article 124</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-32">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fasi.22856">Delayed open access: an overlooked high-impact category of openly available scientific literature</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the American Society for Information Science and Technology</span></span> <b itemprop="volumeNumber">64</b></span>(<span itemprop="issueNumber">7</span>)</span>:<span class="fpage" itemprop="pageStart">1323</span>-<span class="lpage" itemprop="pageEnd">1329</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-33">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bukvova</span> <span class="given-names" itemprop="givenName">H</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nyman</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0020961">The development of open access journal publishing from 1993 to 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b 
itemprop="volumeNumber">6</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e20961</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-34">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Matsubayashi</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kurata</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Sakai</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mine</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ueda</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> 
<cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009">Status of open access in the biomedical field in 2005</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">97</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">4</span>-<span class="lpage" itemprop="pageEnd">11</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-35">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McCabe</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Snyder</span> <span class="given-names" itemprop="givenName">C</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1111%2Fecin.12064">Identifying the effect of open access on citations using a panel of science journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Economic Inquiry</span></span> <b itemprop="volumeNumber">52</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">1284</span>-<span class="lpage" itemprop="pageEnd">1300</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-36">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McKiernan</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bourne</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brown</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Buck</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kenall</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Lin</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McDougall</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nosek</span> <span class="given-names" itemprop="givenName">BA</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ram</span> <span class="given-names" itemprop="givenName">K</span></span>, <span 
class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soderberg</span> <span class="given-names" itemprop="givenName">CK</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName"> Spies</span> <span class="given-names" itemprop="givenName"> JR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Updegrove</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Woo</span> <span class="given-names" itemprop="givenName">KH</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Yarkoni</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rodgers</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7554%2FeLife.16800">How open science helps researchers succeed</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">eLife</span></span> <b itemprop="volumeNumber">5</b></span>:<span class="elocation-id" itemprop="pageStart">e16800</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-37">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paul-Hus</span> <span class="given-names" itemprop="givenName">A</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-015-1765-5">The journal coverage of Web of Science and Scopus: a comparative analysis</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">106</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">213</span>-<span class="lpage" itemprop="pageEnd">228</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-38">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ottaviani</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0159614">The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">11</b></span>(<span itemprop="issueNumber">8</span>)</span>:<span class="fpage" itemprop="pageStart">e0159614</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-39">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Packer</span> <span class="given-names" itemprop="givenName">AL</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010">The SciELO open access: a gold way from the south</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Canadian Journal of Higher Education</span></span> <b itemprop="volumeNumber">39</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">111</span>-<span class="lpage" itemprop="pageEnd">126</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-40">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">PLOS</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication">Reviewer guidelines: criteria for publication</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-41">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Schiermeier</span> <span class="given-names" itemprop="givenName">Q</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mega</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1038%2Fnature.2016.21223">Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Nature News</span></span> <b itemprop="volumeNumber">541</b></span>(<span itemprop="issueNumber">7635</span>)</span>:<span class="fpage" itemprop="pageStart">13</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-42">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Smith</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Haustein</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fei</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ridde</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>.</b></span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=">Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Health Research Policy and System</span></span> <span 
class="comment">In Press</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-43">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">SPARC Europe</span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/">The open access citation advantage: list of studies until 2015</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-44">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Suber</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://dash.harvard.edu/handle/1/4322580">Gratis and libre open access</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">SPARC Open Access Newsletter, 124</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-45">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D">The open access citation advantage</a>.</cite> <span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-08-02">2 August 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-46">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">JP</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Waldner</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jacques</span> <span class="given-names" itemprop="givenName">DC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Masuzzo</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Collister</span> <span class="given-names" itemprop="givenName">LB</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hartgerink</span> <span class="given-names" itemprop="givenName">CH</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.8460.3">The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000 Research</span></span> <b 
itemprop="volumeNumber">5</b></span> <span class="comment">Article 632</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-47">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Universität Konstanz</span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/">Teurer als die Wissenschaft erlaubt</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-48">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Université de Montréal</span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm">UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-49">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Wagner</span> <span class="given-names" itemprop="givenName">AB</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.5062%2FF4Q81B0W">Open access citation advantage: an annotated bibliography</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Issues in Science and Technology Librarianship</span></span> <b itemprop="volumeNumber">60</b></span>:<span class="fpage" itemprop="pageStart">2</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-50">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walker</span> <span class="given-names" itemprop="givenName">TJ</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soichi</span> <span class="given-names" itemprop="givenName">transl. T</span></span>.</b> <b class="year" itemprop="datePublished">1998</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1241%2Fjohokanri.41.678">Free internet access to traditional journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Information Processing and Management</span></span> <b itemprop="volumeNumber">41</b></span>(<span itemprop="issueNumber">9</span>)</span>:<span class="fpage" itemprop="pageStart">678</span>-<span class="lpage" itemprop="pageEnd">694</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-51">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2003</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003">The nine flavours of open access scholarly publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Postgraduate Medicine</span></span> <b itemprop="volumeNumber">49</b></span>:<span class="fpage" itemprop="pageStart">263</span>-<span class="lpage" itemprop="pageEnd">267</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Book" id="ref-52">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite class="article-title"></cite> <span itemprop="name"><a class="source" target="_blank" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=&amp;publication_year=2009">The access principle: the case for open access to research and scholarship</a></span><span> (<span class="edition">1 edition</span>). Cambridge: <span class="publisher">MIT Press</span>. </span>
+</div></li>
+</ul></section>
+ </footer></article>
+ </div>
+
+
+ <div id="related-research"></div>
+
+ <!-- annotations -->
+ <ul class="nav nav-tabs annotation-tabs-nav">
+ <li class="active"><a href="#questions" data-toggle="tab"><i class="icon-comments"></i> Questions
+ <span class="annotation-counter annotation-counter-questioning"></span></a></li>
+ <li><a href="#links" data-toggle="tab"><i class="icon-link"></i> Links
+ <span class="annotation-counter annotation-counter-linking"></span></a></li>
+ </ul>
+
+ <div class="tab-content annotation-tab-content">
+ <div class="tab-pane active" id="questions">
+ <div class="annotations" id="questions" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/questions/index.html?target=articles/4375&amp;_sort=score">Questions</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-question"
+ data-toggle="annotation-form"
+ data-target="#annotation-question-create-container"
+ rel="nofollow"
+ href="/questions.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Ask a question</a>
+ <div class="help-block annotation-learn-more"><a href="/about/FAQ/academic-contribution/" target="_blank">Learn more about Q&amp;A</a></div>
+ <div class="annotation-form-container"
+ id="annotation-question-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+
+ <div class="tab-pane" id="links">
+ <div class="annotations" id="links" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/links/index.html?target=articles/4375&amp;_sort=score">Links</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-link"
+ data-toggle="annotation-form"
+ data-target="#annotation-link-create-container"
+ rel="nofollow"
+ href="/links.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Add a link</a>
+ <div class="annotation-form-container"
+ id="annotation-link-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+
+ <div class="hidden-desktop" id="mobile-featured-jobs"></div>
+ </div>
+
+ <!-- Right sidebar -->
+ <div class="span3 offset1 article-sidebar visible-desktop">
+ <div id="article-sidebar-main-content" data-todo-href="/todos/19698/">
+ <div class="dimensions-stats-container">
+ <span class="__dimensions_badge_embed__" data-doi="10.7717/peerj.4375" data-hide-zero-citations="true" data-legend="always" data-style="small_circle"></span>
+ </div>
+
+
+ <div class="row-fluid item-action-buttons article-sidebar-item">
+ <div class="span12">
+ <a href="/benefits/" class="author-quote article-author-quote-link">
+ <div class="author-quote-text">
+ <span class="lead-in">I published in PeerJ</span> and it is very fast, has good editors, has consistently given good quality and rigorous reviews of my work, and produces visually appealing manuscripts.</div>
+ <div class="author-quote-details">
+ <span class="author-quote-name">Matthew Jackson</span><br>
+ PeerJ author
+ </div>
+</a> <div class="article-free-publishing-cta">
+ <div class="article-free-publishing-cta-title">Publish Free in 2020</div>
+ <div class="article-free-publishing-cta-subline">In PeerJ Chemistry Journals</div>
+ <a href="https://peerj.com/blog/post/115284881305/free-open-access-publishing-for-chemistry-and-computer-science-subject-areas" class="btn btn-article article-free-publishing-cta-btn">
+ Learn more
+ </a>
+ </div>
+ <div id="download-modal-trigger" class="js-download-modal-trigger btn btn-article btn-download btn-success mb-3 ">
+ Download
+</div> <!--<div class="content-cta-intro-text">Want alerts from articles like this?</div>-->
+<div id="content-alert-link" class="content-alert-link-btn" data-href="/content-alert/?aid=19698">
+ <div id="content-alert-button-label">
+ <i class="icon-envelope btn-content-alert-icon"></i>
+ Content <div class="content-alert-btn-lastword">Alert</div>
+ </div>
+ <div id="content-alert-button-loading" style="display:none;"><i class="icon-spin icon-spinner"></i> Loading...</div>
+</div>
+ <div class="content-cta-help-text">
+ Just enter your email
+ </div>
+ </div>
+ </div>
+
+
+
+
+ <nav class="article-sidebar-block">
+ <div class="sidebar-heading">
+ <i class="icon-wrench"></i> Tools & info
+ </div>
+ <ul class="nav nav-list article-item-metrics-counts" data-src="/articles/4375/counter/">
+ <li>
+ <a href="/articles/4375/reviews/"
+ rel="version-history">Peer Review history</a>
+ </li>
+
+
+ <li><a href="/articles/4375/citations/" data-toggle="modal" data-target="#citing-modal">See citing articles <span class="metric-counter citation-item-count">203</span></a></li>
+
+
+ <li><a href="#questions">Ask questions
+ <span class="metric-counter annotation-counter-questioning"></span></a></li>
+
+ <li><a href="#links">Add links
+ <span class="metric-counter annotation-counter-linking"></span></a></li>
+
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Visitors <span class="metric-counter" data-count="visitors">&nbsp;</span> <span class="pull-right metric-counter-details-cta">click for details</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Views <span class="metric-counter" data-count="views-html">&nbsp;</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Downloads <span class="metric-counter" data-count="views-pdf">&nbsp;</span></a></li>
+
+ <li><a id="item-flag-button" data-toggle="modal" href="#flagModal">Report problem with article</a></li>
+ </ul>
+ </nav>
+
+
+ <div id="related-research-sidebar"></div>
+
+</div>
+<nav class="article-sidebar-block follow" >
+ <div class="sidebar-heading">
+ <i class="icon-list-ul"></i> Outline
+ </div>
+ <div class="article-navigation"></div>
+ <div id="top-return" class="top-return">
+ <i class="icon-arrow-up"></i> Return to top
+ </div>
+
+ <div data-clone="#expertrxiv-related" data-source="/expertrxiv/related/?subjectIds=85%2C87%2C111&amp;subjects=Legal%20Issues%2C%20Science%20Policy%2C%20Data%20Science"></div>
+
+ </nav>
+
+<div class="subjects-navigation"></div>
+
+ <div id="article-identifiers">
+ <span class="article-meta-name">PubMed</span>
+ <a href="https://www.ncbi.nlm.nih.gov/pubmed/29456894"
+ id="article-identifier-pmid" target="_blank">29456894</a>
+ </div>
+ </div>
+ </div>
+
+
+<style>
+ .modal-loading-container{ /* flex wrapper that horizontally centres its content (the download modal's loading spinner) */
+ display:flex;
+ justify-content:center;
+ color:#999; /* grey foreground colour */
+ padding:3rem;
+ }
+</style>
+
+<div id="download-article-modal" class="modal hide fade peer-review-article" style="">
+
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Download article</h3>
+ </div>
+
+ <div class="modal-body">
+ <div id="download-article-modal-loading" class="modal-loading-container" style="display:none;">
+ <i class="icon-spin icon-3x icon-spinner"></i>
+ </div>
+ <div id="download-article-modal-body">
+ <div id="download-modal-buttons-container">
+ <div class="download-modal-article-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</div>
+ <div class="mt-2 download-buttons">
+ <a target="_blank" download data-format="PDF" data-download-confirm-text="PDF downloaded" href="https://peerj.com/articles/4375.pdf" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> PDF (2.3MB)</a>
+ <a target="_blank" data-download-confirm-text="Mendeley opened" href="http://www.mendeley.com/import/?doi=10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Save to Mendeley</a>
+ <a target="_blank" data-download-confirm-text="Readcube article opened" href="http://www.readcube.com/articles/10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Read in ReadCube</a>
+ <a target="_blank" data-format="RIS" data-download-confirm-text="RIS file downloaded" href="https://peerj.com/articles/4375.ris" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> RIS</a>
+ <a target="_blank" data-format="XML" data-download-confirm-text="XML file downloaded" href="https://peerj.com/articles/4375.xml" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> XML</a>
+ <a target="_blank" data-format="BibText" data-download-confirm-text="BibTeX file downloaded" href="https://peerj.com/articles/4375.bib" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> BibTeX</a>
+
+ </div>
+ </div>
+
+ <div id="download-modal-downloading-message" style="display:none;">
+ <div class="text-center pt-4 pb-4">
+ <div>
+ <strong>Your download will start in a moment...</strong>
+ </div>
+ <div class="btn btn-secondary mt-4 js-close-download-modal">Close</div>
+ </div>
+ </div>
+
+ <div id="download-modal-signup-container" style="display:none;">
+
+<div class="download-modal-cta-container">
+
+ <div class="download-modal-confirm">
+ <div class="download-modal-confirm-title">
+ <i class="icon-tickcircle downloaded-tick"></i> <span class="download-modal-confirm-title-text"></span>
+ <i class="icon-chevron-down show-download-link"></i>
+ </div>
+ <a class="article-modal-download-url" href=""></a>
+ </div>
+
+
+ <div class="download-modal-cta-subtitle-small mt-2 mb-4 text-center">
+ Subscribe for subject updates
+ </div>
+
+ <div class="section-subscribe-container mb-2" style="display: flex;justify-content:center;">
+ <div>
+ <input type="text" placeholder="Email address" name="email" value="" class="form-control" id="download-subscribe-email">
+ </div>
+ <div class="ml-1">
+ <select name="freq" class="form-control" style="width: 100%;" id="download-subscribe-freq">
+ <option value="daily">Daily</option>
+ <option value="weekly">Weekly</option>
+ </select>
+ </div>
+ </div>
+
+ <div id="download-subscribe-error-container" class="mb-2 text-center text-error" style="display:none;"></div>
+
+
+ <button class="btn btn-primary btn-block btn-large mb-2 btn-modal-cta"
+ style="display: block;"
+ id="download-subscribe-submit"
+ data-url="/content-alert/download-subscribe?aid=19698"
+ data-signed-in=""
+ data-section-name="">
+ Subscribe
+ </button>
+
+ <a href="#" class="btn btn-block btn-link btn-large btn-modal-close js-close-download-modal mb-2">
+ Close
+ </a>
+
+</div>
+
+<script>
+ (function(){
+ $('#download-subscribe-submit').click(function(){
+
+ var button = $(this);
+ var url = button.data('url');
+ if(button.attr('disabled')) return;
+
+ $.get(url, function(response){
+
+ if(!response.token){
+ errorContainer.html('Server error, you have not been subscribed').show();
+ button.html('Subscribe').removeAttr('disabled');
+ return;
+ }
+
+ var errorContainer = $('#download-subscribe-error-container');
+ errorContainer.html('').hide();
+ button.html('<i class="icon-spin icon-spinner"></i>').attr('disabled', true);
+
+ var signedIn = button.data('signed-in');
+ var sectionName = button.data('section-name');
+ var data = {
+ _token: response.token
+ };
+
+ if(!signedIn) {
+ var email = $('#download-subscribe-email').val();
+ data.email = email;
+ data.freq = $('download-subscribe-freq').val();
+ }
+
+ $.ajax({
+ url: url,
+ method: 'POST',
+ data: data
+ }).success(function(response){
+ button.hide();
+ $('.js-close-download-modal').trigger('click');
+
+ PeerJ.Tools.ToastNotifications.add({
+ type: 'success',
+ title: 'Subscribed',
+ text: sectionName ? 'You subscribed to ' + sectionName : 'You subscribed to this article\'s subjects'
+ });
+
+ }).error(function(response){
+ if(response.responseJSON && response.responseJSON.errors){
+ errorContainer.html(response.responseJSON.errors[0]).show();
+ }
+ }).complete(function(){
+ button.html('Subscribe').removeAttr('disabled');
+ });
+
+ });
+ });
+
+ }());
+</script>
+ </div>
+ </div>
+ </div>
+
+ <div class="modal-footer" style="display:none;">
+ <div class="pull-right">
+ </div>
+
+ <span class="submit-copy submit-copy-btn btn cancel pull-left" id="modal-cancel" data-dismiss="modal">
+ Cancel
+ </span>
+ </div>
+</div>
+
+ <div id="ajax-form"></div>
+
+ <!-- Flag Modal -->
+ <div id="flagModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align: center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim">Report a problem</h3>
+ </div>
+
+ <form id="article-flag-form"
+ data-href="/issues/4375/flag/"
+ method="post">
+
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="alert alert-info">
+ <p><strong>Common use cases</strong><br>
+ Typos, corrections needed, missing information, abuse, etc
+ </p>
+
+ <p><strong>Our promise</strong><br>
+ PeerJ promises to address all issues as quickly and professionally as possible. We
+ thank you in advance for your patience and understanding.
+ </p>
+ </div>
+
+ <div id="flag-modal-result" style="margin-left:45px;">
+
+ <div>
+ <label><strong>Type of problem</strong></label>
+ <p>
+ <select id="moderation_flag_category" name="moderation_flag[category]" class="span4"><option value="typo">Typo</option><option value="metadata">Missing or incorrect metadata</option><option value="quality">Quality: PDF, figure, table, or data quality</option><option value="download">Download issues</option><option value="abuse">Abusive behavior</option><option value="misconduct">Research misconduct</option><option value="other">Other issue not listed above</option></select>
+
+ </p>
+ </div>
+ <div>
+ <label><strong>Details</strong> <i class="icon-large icon-question-sign" title="Please be as detailed as possible within the 500 character limit. Any details you provide will not be shown publicly." data-toggle="tooltip"></i></label>
+ <div>
+ <textarea id="moderation_flag_detail" name="moderation_flag[detail]" required="required" maxlength="500" class="span4" placeholder="Enter any details about this issue. Kept confidential with PeerJ staff." rows="5" data-counter-target="#flag-counter"></textarea>
+
+ <div style="margin:10px 0 0 0; color:#777777; float: left; display: block"><span id="flag-counter" class="label">500</span> characters remaining</div>
+ </div>
+ </div>
+
+ </div>
+
+ </div>
+ </form>
+ <div id="flag-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Cancel</button>
+ <input type="submit" class="btn btn-success save-flag-btn" value="Send report">
+ </div>
+</div>
+
+ <!-- Follow Publication Modal -->
+ <div id="followModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align:center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim" id="followModalLabel">Follow this publication for updates</h3>
+ </div>
+
+ <div>
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="row-fluid" style="margin-bottom: 15px">
+ <div class="span1">
+ <i class="icon-large icon-bullhorn"></i>
+ </div>
+ <div class="span11">
+ "Following" is like subscribing to any updates related to a publication.
+ These updates will appear in your home dashboard each time you visit PeerJ.
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span1">
+ <i class="icon-large icon-envelope"></i>
+ </div>
+ <div class="span11">
+ <p>
+ You can also choose to receive updates via daily or weekly email digests.
+ If you are following multiple publications then we will send you
+ no more than one email per day or week based on your preferences.
+ </p>
+ <p>
+ <em>Note: You are now also subscribed to the subject areas of this publication</em>
+ and will receive updates in the daily or weekly email digests if turned on.
+ You can <a href="/settings/details/">add specific subject areas</a> through your profile settings.
+ </p>
+ </div>
+ </div>
+
+ <hr>
+ <div id="follow-modal-result" style="margin-left:-40px;padding-top:7px;">
+ </div>
+
+ </div>
+
+ </div>
+
+ <div id="follow-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+ </div>
+
+ <!-- Unfollow Publication Modal -->
+ <div id="unfollowModal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Change notification settings or unfollow</h3>
+ </div>
+
+ <form id="article-unfollow-form"
+ data-href="/follow/publication/4375/1/"
+ method="put" class="form-horizontal">
+
+
+ <div id="unfollow-form-load-result" class="modal-body" data-href="/follow/publication/4375/edit/" style="max-height:350px;overflow-y:auto">
+ <p>Loading ...</p>
+ </div>
+
+ </form>
+ <div class="modal-footer">
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ <input type="submit" class="btn btn-success update-follow-btn" value="Update">
+ </div>
+</div>
+
+ <!-- Metrics Modal -->
+ <div id="metricsModal" class="modal hide">
+ <div class="modal-body" style="max-height:330px;overflow-y:auto">
+
+ <div class="row-fluid">
+ <div class="span12">
+ <p class="leadh2">Usage since published - updated daily</p>
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span8">
+ <h3 style="margin-bottom:10px">Social referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Twitter</div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Facebook</div>
+ <div class="span3" style="text-align:right;min-height:0">676</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Reddit</div>
+ <div class="span3" style="text-align:right;min-height:0">15</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">LinkedIn</div>
+ <div class="span3" style="text-align:right;min-height:0">11</div>
+ </div>
+
+ <h3 style="margin:30px 0 10px 0">Top referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From bookmark or typed URL
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">30,876</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Google search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">5,439</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Twitter
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From PeerJ Content Alert Emails
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">32</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Yahoo search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">20</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Webmail
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">3</div>
+ </div>
+ </div>
+
+ <div class="span4" style="overflow-x:hidden;">
+ <h3 style="margin-bottom:10px">Share this publication</h3>
+
+
+
+ <ul class="unstyled">
+ <li>
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+</ul>
+ <h3 style="margin-bottom:10px;margin-top:10px">Metrics</h3>
+
+ <!-- Altmetric -->
+ <div class="altmetric-embed" data-badge-popover="right"
+ data-link-target="_blank" data-doi="10.7717/peerj.4375"></div>
+ </div>
+ </div>
+
+ </div>
+
+ <div class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Wiki Modal -->
+
+ <!-- Links Modal -->
+ <div class="modal hide fade" id="article-links-modal">
+ <div class="modal-header">
+ <a rel="nofollow" data-dismiss="modal" aria-hidden="true" class="close">&times;</a>
+
+ <h3 class="modal-title">Links</h3>
+ </div>
+
+ <div class="modal-body"></div>
+
+ <div class="modal-footer">
+ <a rel="nofollow" href="/links.form?target=articles/4375" class="btn btn-primary">Add a link</a>
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Citing Modal -->
+ <div id="citing-modal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h2 class="slim"><i class="icon-copy"></i> Articles citing this paper</h2>
+ </div>
+ <div class="modal-body">Loading citing articles… <i class="icon icon-spinner icon-spin"></i></div>
+</div>
+
+ <!-- Graphical abstract modal -->
+
+ </div>
+
+
+ <div id="push"></div>
+ </div>
+
+ <footer id="footer">
+ <div class="foot">
+ <div class="container">
+
+ <div class="row">
+ <div class="span7">
+ <b>About us -</b> <a href="/about/" class="aboutLink" data-target="team">PeerJ team</a>
+ | <a href="/about/publications/" class="aboutLink" data-target="journals">Our publications</a> |
+ <a href="/benefits/">Benefits</a> | <a
+ href="/about/partnerships/" class="aboutLink" data-target="partnership">Partnerships</a> | <a
+ href="/about/endorsements/" class="aboutLink" data-target="endorsements">Endorsements</a>
+ <i class="icon-trophy"></i> <a href="/about/reviews/" class="aboutLink" data-target="reviews">Awards</a>
+ </div>
+ <div class="span5">
+ <b>Resources -</b> <a href="/about/FAQ/">FAQ</a> | <a
+ href="/about/careers/">Careers</a> | <a href="/about/press/">Press
+ room</a> | <a href="/about/terms/">Terms of use</a> | <a
+ href="/about/privacy/">Privacy</a> | <a
+ href="/about/contact/" class="aboutLink" data-target="contact">Contact</a>
+ </div>
+ <div class="span7">
+ <b>Academic boards -</b> <a href="/academic-boards/advisors/">Advisors</a> | <a
+ href="/academic-boards/editors/">Editors</a> |
+ <a href="/academic-boards/subjects/">Subject areas</a>
+ </div>
+ <div class="span5">
+ <b>Follow us -</b>
+ <a href="https://peerj.com/blog/">PeerJ blog</a> |
+ <a href="http://twitter.com/thePeerJ/" title="Follow on Twitter" data-toggle="tooltip">Twitter</a>
+ |
+ <a href="http://facebook.com/thePeerJ/" title="Follow on Facebook" data-toggle="tooltip">Facebook</a>
+ |
+ <a href="http://www.linkedin.com/company/peerj" title="Follow on LinkedIn" data-toggle="tooltip">LinkedIn</a>
+ |
+ <a href="https://www.instagram.com/thepeerj" title="Follow on Instagram" data-toggle="tooltip">Instagram</a>
+ |
+ <a href="http://www.pinterest.com/thepeerj/boards/" title="Follow on Pinterest" data-toggle="tooltip">Pinterest</a>
+ </div>
+ <div class="span7">
+ <b>Submission guides -</b>
+ <a href="/about/aims-and-scope"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/about/aims-and-scope/cs"><em>PeerJ Computer Science</em></a> |
+ <a href="/about/aims-and-scope/chemistry"><em>PeerJ Chemistry</em></a>
+ </div>
+ <div class="span5">
+ <b>Spread the word</b> -
+ <a href="/spread-the-word/activities/">Activities</a> |
+ <a href="/spread-the-word/resources/">Resources</a>
+ </div>
+ <div class="span7">&nbsp;</div>
+ <div class="span5">
+ <b>PeerJ feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom" rel="alternate" title="Articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1">RSS 1.0</a> |
+ <a href="/articles/index.rss2">RSS 2.0</a> |
+ <a href="/articles/index.json">JSON</a>
+ <br>
+
+ <b>PeerJ Computer Science feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom?journal=cs" rel="alternate" title="PeerJ Computer Science articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1?journal=cs">RSS 1.0</a> |
+ <a href="/articles/index.rss2?journal=cs">RSS 2.0</a> |
+ <a href="/articles/index.json?journal=cs">JSON</a>
+ <br>
+ <b>Archives - </b>
+ <a href="/archives/" rel="archives"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/archives/?journal=cs" rel="archives"><em>PeerJ Computer Science</em></a>
+ </div>
+
+</div>
+
+<div id="fb-root"></div>
+
+ <div class="row" style="margin-top:10px;font-size:12px">
+ <div class="span12" style="color:#888">
+
+ <div>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ</span> ISSN: 2167-8359</span>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ Comput. Sci.</span> ISSN: 2376-5992</span>
+ <span><span style="font-style:italic">PeerJ Preprints</span> ISSN: 2167-9843</span>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+ </footer>
+
+ <div id="alerts" data-async-alerts="/alerts/"></div>
+
+ <script src="/js/8d39319-35fca22.js"></script>
+ <script src="https://cdn.peerj.com/webpack/runtime.bfc7ab93.js"></script><script src="https://cdn.peerj.com/webpack/0.7880a6b6.js"></script><script src="https://cdn.peerj.com/webpack/1.24ea793f.js"></script><script src="https://cdn.peerj.com/webpack/vue-bundle.9bf24d69.js"></script>
+
+
+ <script src="/js/5d3c493-193ec0b.js"></script>
+
+ <script src="/js/c1dacd9-f146d62.js"></script>
+ <!--[if gt IE 8]><!-->
+ <script src="/assets/js/highlight/highlight.pack.js"></script>
+
+ <script>
+ $(function () {
+ // syntax highlighting for code blocks
+ $("pre > code").each(function() {
+ var node = $(this);
+
+ var language;
+
+ // JATS >=1.1
+ language = node.data('jats-language');
+
+ if (!language) {
+ // JATS <1.1
+ language = node.data('jats-preformat-type');
+
+ // ignore default 'code' type
+ if (language === 'code') {
+ language = null;
+ }
+ }
+
+ if (language) {
+ node.addClass('language-' + language);
+ }
+
+ hljs.highlightBlock(this);
+ });
+ });
+ </script>
+ <!--<![endif]-->
+
+ <script>
+ // Initialise the follow button (PeerJ.Event.Follow) via the jQuery ready callback.
+ $(function() {
+ PeerJ.Event.Follow.init();
+ });
+
+ // Show the citations modal (#citing-modal) if the URL has a "citations" query parameter.
+ var urlParams = new URLSearchParams(window.location.search);
+ if(urlParams.has('citations')){
+ $('#citing-modal').modal('show');
+ }
+
+ </script>
+
+
+<script type="text/x-mathjax-config">
+ MathJax.Hub.Config({ // MathJax settings supplied through this text/x-mathjax-config script
+ messageStyle: "none",
+ imageFont: null,
+ "CommonHTML": { // settings for the CommonHTML output: automatic line breaks, scale 95
+ linebreaks: { automatic: true },
+ scale: 95
+ },
+ "HTML-CSS": { // settings for the HTML-CSS output: automatic line breaks, scale 90
+ linebreaks: { automatic: true },
+ scale: 90
+ },
+ menuSettings: {
+ zoom: "Click"
+ }
+ });
+
+ MathJax.Ajax.config.root = "/bundles/peerjmathjax/MathJax/"; // NOTE(review): looks like this points MathJax at the site-bundled copy loaded below — confirm
+</script>
+
+<script src="/bundles/peerjmathjax/MathJax/MathJax.js?config=TeX-MML-AM_HTMLorMML,Safe&noContrib"></script>
+
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research"}'></script>
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research-sidebar"}'></script>
+ <script async src="https://badge.dimensions.ai/badge.js" charset="utf-8"></script>
+
+ <div id="content-alert-container"></div>
+
+ <div id="toast-container"></div>
+
+ <div id="vue-notifications"></div>
+
+ <div id="vue-confirm-modal"></div>
+
+ <script>
+ $(PeerJ.Home.Banner.init); // pass the banner initialiser to jQuery as a ready callback
+ </script>
+
+ </body>
+</html>
diff --git a/python/tests/files/plos_one_article.html b/python/tests/files/plos_one_article.html
new file mode 100644
index 0000000..9abfe00
--- /dev/null
+++ b/python/tests/files/plos_one_article.html
@@ -0,0 +1,1707 @@
+
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:dc="http://purl.org/dc/terms/"
+ xmlns:doi="http://dx.doi.org/"
+ lang="en" xml:lang="en"
+ itemscope itemtype="http://schema.org/Article"
+ class="no-js">
+
+
+
+<head prefix="og: http://ogp.me/ns#">
+ <title>Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</title>
+
+
+
+
+
+
+
+<link rel="stylesheet" href="/plosone/resource/compiled/asset_KWXDCDFJFQKCXTNZJE7SIB7MT43CSDVH.css" />
+
+ <!-- allows for extra head tags -->
+
+
+<!-- hello -->
+<link rel="stylesheet" type="text/css"
+ href="https://fonts.googleapis.com/css?family=Open+Sans:400,400i,600">
+
+<link media="print" rel="stylesheet" type="text/css" href="/plosone/resource/css/print.css"/>
+ <script type="text/javascript">
+ var siteUrlPrefix = "/plosone/";
+ </script>
+<script src="/plosone/resource/compiled/asset_SC5JIUGEUPR4P4P6VBUINUVOVUSU3NRY.js"></script>
+
+ <link rel="shortcut icon" href="/plosone/resource/img/favicon.ico" type="image/x-icon"/>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+
+
+
+
+
+ <link rel="canonical" href="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978" />
+ <meta name="description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers." />
+
+ <meta name="citation_abstract" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers.">
+
+
+ <meta name="keywords" content="Chickens,Antibodies,Livestock,Attenuated vaccines,Enzyme-linked immunoassays,Poultry,Animal sexual behavior,Vaccines" />
+
+
+<meta name="citation_doi" content="10.1371/journal.pone.0213978"/>
+<meta name="citation_author" content="Yang Li"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Tuanjie Wang"/>
+ <meta name="citation_author_institution" content="China Institute of Veterinary Drug Control, Beijing, China"/>
+<meta name="citation_author" content="Lin Wang"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Mingjun Sun"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Zhizhong Cui"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+<meta name="citation_author" content="Shuang Chang"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+<meta name="citation_author" content="Yongping Wu"/>
+ <meta name="citation_author_institution" content="College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China"/>
+<meta name="citation_author" content="Xiaodong Zhang"/>
+ <meta name="citation_author_institution" content="College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China"/>
+<meta name="citation_author" content="Xiaohui Yu"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Tao Sun"/>
+ <meta name="citation_author_institution" content="Shandong Entry-exit Inspection and Quarantine Bureau, Qingdao, China"/>
+<meta name="citation_author" content="Peng Zhao"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+
+<meta name="citation_title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta itemprop="name" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta name="citation_journal_title" content="PLOS ONE"/>
+<meta name="citation_journal_abbrev" content="PLOS ONE"/>
+<meta name="citation_date" content="Apr 22, 2019"/>
+<meta name="citation_firstpage" content="e0213978"/>
+<meta name="citation_issue" content="4"/>
+<meta name="citation_volume" content="14"/>
+<meta name="citation_issn" content="1932-6203"/>
+<meta name="citation_publisher" content="Public Library of Science"/>
+
+ <meta name="citation_pdf_url" content="https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable">
+
+ <meta name="citation_article_type" content="Research Article">
+
+<meta name="dc.identifier" content="10.1371/journal.pone.0213978" />
+
+
+ <meta name="twitter:card" content="summary" />
+ <meta name="twitter:site" content="@plosone"/>
+ <meta name="twitter:title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" />
+ <meta property="twitter:description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers." />
+ <meta property="twitter:image" content="https://journals.plos.org/plosone/article/figure/image?id=10.1371/journal.pone.0213978.t003&size=inline" />
+
+<meta property="og:type" content="article" />
+<meta property="og:url" content="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978"/>
+<meta property="og:title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta property="og:description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers."/>
+<meta property="og:image" content="https://journals.plos.org/plosone/article/figure/image?id=10.1371/journal.pone.0213978.t003&size=inline"/>
+
+<meta name="citation_reference" content="citation_title=Occurrence of reticuloendotheliosis in Chinese partridge;citation_author=Z. Cheng;citation_author=Y. Shi;citation_author=L. Zhan;citation_author=G. Zhu;citation_author=X Diao;citation_author=Z. Cui;citation_journal_title=J Vet Med Sci;citation_volume=69;citation_number=69;citation_issue=12;citation_first_page=1295;citation_last_page=1298;citation_publication_date=2007;"/>
+<meta name="citation_reference" content="citation_title=Simultaneous endemic infections with subgroup J avian leukosis virus and reticuloendotheliosis virus in commercial and local breeds of chickens;citation_author=Z. Cui;citation_author=S. Sun;citation_author=Z. Zhang;citation_author=S. Meng;citation_journal_title=Avian Pathol;citation_volume=38;citation_number=38;citation_issue=6;citation_first_page=443;citation_last_page=448;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Serological survey of the Reticuloendotheliosis virus infection in China native chicken flocks;citation_author=P. Zhao;citation_author=C. Ma;citation_author=Y. Du;citation_author=Z. Cui;citation_journal_title=Pak Vet J;citation_volume=32;citation_number=32;citation_first_page=621;citation_last_page=623;citation_publication_date=2012;"/>
+<meta name="citation_reference" content="citation_title=Vertical transmission of reticuloendotheliosis virus in breeder turkeys;citation_author=R.L. Witter;citation_author=D.W. Salter;citation_journal_title=Avian Dis;citation_volume=33;citation_number=33;citation_first_page=226;citation_last_page=235;citation_publication_date=1989;"/>
+<meta name="citation_reference" content="citation_title=An outbreak of lymphomas in commercial broiler breeder chickens vaccinated with a fowlpox vaccine contaminated with reticuloendotheliosis virus;citation_author=A.M. Fadly;citation_author=R.L. Witter;citation_author=E.J. Smith;citation_author=R.F. Silva;citation_author=W.M. Reed;citation_author=F.J Hoerr;citation_author=M.R. Putnam;citation_journal_title=Avian Pathol;citation_volume=25;citation_number=25;citation_issue=1;citation_first_page=35;citation_last_page=47;citation_publication_date=1996;"/>
+<meta name="citation_reference" content="citation_title=Detection of reticuloendotheliosis virus in live virus vaccines of poultry;citation_author=A Fadly;citation_author=M.C. Garcia;citation_journal_title=Dev. Biol;citation_volume=126;citation_number=126;citation_first_page=301;citation_last_page=305;citation_publication_date=2005;"/>
+<meta name="citation_reference" content="citation_title=Detection of reticuloendotheliosis virus as a contaminant of fowl pox vaccines;citation_author=A.M. Awad;citation_author=H.S. Abd El-Hamid;citation_author=A.A. Abou Rawash;citation_author=H.H. Ibrahim;citation_journal_title=Poult. Sci;citation_volume=89;citation_number=89;citation_issue=11;citation_first_page=2389;citation_last_page=2395;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Isolation, identification, and whole genome sequencing of reticuloendotheliosis virus from a vaccine against Marek’s disease;citation_author=J.P. Li;citation_author=X. Dong;citation_author=C. Yang;citation_author=Q.H. Li;citation_author=Z. Cui;citation_author=S. Chang;citation_author=P. Zhao;citation_author=K.Z Yu;citation_author=C. Yang;citation_journal_title=Poult. Sci;citation_volume=94;citation_number=94;citation_issue=4;citation_first_page=643;citation_last_page=649;citation_publication_date=2015;"/>
+<meta name="citation_reference" content="citation_title=Isolation of Reticuloendotheliosis Virus from a fowlpox live vaccine and env gene sequence analysis;citation_author=J Wang;citation_author=Z Li;citation_author=P Zhao;citation_author=H Chen;citation_author=Z Cui;citation_journal_title=Chinese Journal of Animal Infectious Diseases;citation_volume=18;citation_number=18;citation_first_page=35;citation_last_page=39;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Probable congenital transmission of reticuloendotheliosis virus caused by vaccination with contaminated vaccines;citation_author=K. Wei;citation_author=Z. Sun;citation_author=S. Zhu;citation_author=W. Guo;citation_author=P. Sheng;citation_author=P. Wang;citation_author=C. Zhao;citation_author=Q. Zhao;citation_author=R. Zhu;citation_journal_title=PLoS One;citation_volume=7;citation_number=7;citation_first_page=e43422;citation_publication_date=2012;"/>
+<meta name="citation_reference" content="citation_title=Isolation of a reticuloendotheliosis virus from chickens inoculated with Marek’s disease vaccine;citation_author=N. Yuasa;citation_author=I. Yoshida;citation_author=T. Taniguchi;citation_journal_title=Natl. Inst. Anim. Health Q;citation_volume=16;citation_number=16;citation_issue=4;citation_first_page=141;citation_last_page=151;citation_publication_date=1976;"/>
+<meta name="citation_reference" content="citation_title=Infection studies on a reticuloendotheliosis virus contaminant of a commercial Marek’s disease vaccine;citation_author=T. J. Bagust;citation_author=T. M. Grimes;citation_author=D. P. Dennett;citation_journal_title=Aust Vet J;citation_volume=55;citation_number=55;citation_issue=4;citation_first_page=153;citation_last_page=157;citation_publication_date=1979;"/>
+<meta name="citation_reference" content="citation_title=Field isolates of fowlpox virus contaminated with reticuloendotheliosis virus;citation_author=I. S. Diallo;citation_author=M. A. Mackenzie;citation_author=P. B. Spradbrow;citation_journal_title=Avian pathol;citation_volume=27;citation_number=27;citation_issue=1;citation_first_page=60;citation_last_page=66;citation_publication_date=1998;"/>
+<meta name="citation_reference" content="citation_title=Field and vaccine strains of fowlpox virus carry integrated sequences from the avian retrovirus, reticuloendotheliosis virus;citation_author=C Hertig;citation_author=B. E. Coupar;citation_author=A. R. Gould;citation_author=D.B. Boyle;citation_journal_title=Virology;citation_volume=235;citation_number=235;citation_issue=2;citation_first_page=367;citation_last_page=376;citation_publication_date=1997;"/>
+<meta name="citation_reference" content="citation_title=Reticuloendotheliosis virus (REV) long terminal repeats incorporated in the genomes of commercial fowl poxvirus vaccines and pigeon poxviruses without indication of the presence of infectious REV;citation_author=K. M. Moore;citation_author=J. R Davis;citation_author=T Sato;citation_journal_title=Avian Dis;citation_volume=44;citation_number=44;citation_issue=4;citation_first_page=827;citation_last_page=841;citation_publication_date=2000;"/>
+<meta name="citation_reference" content="citation_title=In vivo events of retroiral long terminal repeat integration into Marek’s disease virus in commerial poultry: detection of chimeric molecules as a marker;citation_author=I Davidson;citation_author=R. Borenshtain;citation_journal_title=Avian Disease;citation_volume=45;citation_number=45;citation_issue=1;citation_first_page=102;citation_last_page=121;citation_publication_date=2001;"/>
+<meta name="citation_reference" content="citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"/>
+<meta name="citation_reference" content="citation_title=A BAC clone of MDV strain GX0101 with REV-LTR integration retained its pathogenicity;citation_author=A J. Sun;citation_author=L. P. Petherbridge;citation_author=Y. G. Zhao;citation_author=Y. P. Li;citation_author=K. Nair. Venugopal;citation_author=Z.Z Cui;citation_journal_title=Chinese Science Bulletin;citation_volume=54;citation_number=54;citation_issue=15;citation_first_page=2641;citation_last_page=2647;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Detection of fowl poxvirus integrated with reticuloendotheliosis virus sequences from an outbreak in backyard chickens in India;citation_author=S. K. Biswas;citation_author=C. Jana;citation_author=K. Chand;citation_author=W. Rehman;citation_author=B. Mondal;citation_journal_title=Vet Ital;citation_volume=47;citation_number=47;citation_issue=2;citation_first_page=147;citation_last_page=513;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Functional evaluation of the role of reticuloendotheliosis virus long terminal repeat (LTR) integrated into the genome of a field strain of Marek’s disease virus;citation_author=A. J. Sun;citation_author=X. Y. Xu;citation_author=L. Petherbridge;citation_author=Y. G. Zhao;citation_author=V. Nair;citation_author=Z. Z Cui;citation_journal_title=Virology;citation_volume=397;citation_number=397;citation_issue=2;citation_first_page=270;citation_last_page=276;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Protective efficacy of vaccination against highly pathogenic avian influenza is dramatically suppressed by early infection of chickens with reticuloendotheliosis virus;citation_author=S.H. Sun;citation_author=Z.Z. Cui;citation_author=J Wang;citation_author=Z. L Wang;citation_journal_title=Avian Pathol;citation_volume=38;citation_number=38;citation_first_page=31;citation_last_page=34;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Depression of vaccinal immunity to Marek’s disease by infection with reticuloendotheliosis virus;citation_author=R. L. Witter;citation_author=L. F. Lee;citation_author=L. D. Bacon;citation_author=E. J. Smith;citation_journal_title=Infection and Immunity;citation_volume=26;citation_number=26;citation_first_page=90;citation_last_page=98;citation_publication_date=1979;"/>
+<meta name="citation_reference" content="citation_title=Sequencing and analysis of whole genome nucleotide sequence of Chinese REV isolate HA9901;citation_author=Y. Wang;citation_author=Z. Cui;citation_author=S. Jiang;citation_journal_title=Science in China Serices C: Life Sciences;citation_volume=35;citation_number=35;citation_first_page=340;citation_last_page=380;citation_publication_date=2005;"/>
+
+
+<!-- DoubleClick overall ad setup script -->
+<script type='text/javascript'>
+ var googletag = googletag || {};
+ googletag.cmd = googletag.cmd || [];
+ (function() {
+ var gads = document.createElement('script');
+ gads.async = true;
+ gads.type = 'text/javascript';
+ var useSSL = 'https:' == document.location.protocol;
+ gads.src = (useSSL ? 'https:' : 'http:') +
+ '//www.googletagservices.com/tag/js/gpt.js';
+ var node = document.getElementsByTagName('script')[0];
+ node.parentNode.insertBefore(gads, node);
+ })();
+</script>
+
+<!-- DoubleClick ad slot setup script -->
+
+ <script id="doubleClickSetupScript" type='text/javascript'>
+ googletag.cmd.push(function() {
+ googletag.defineSlot('/75507958/PONE_728x90_ATF', [728, 90], 'div-gpt-ad-1458247671871-0').addService(googletag.pubads());
+ googletag.defineSlot('/75507958/PONE_160x600_BTF', [160, 600], 'div-gpt-ad-1458247671871-1').addService(googletag.pubads());
+ googletag.pubads().enableSingleRequest();
+ googletag.enableServices();
+ });
+ </script>
+
+
+
+<script type="text/javascript">
+ var WombatConfig = WombatConfig || {};
+ WombatConfig.resourcePath = "/plosone/resource/";
+ WombatConfig.imgPath = "/plosone/resource/img/";
+ WombatConfig.journalKey = "PLoSONE";
+ WombatConfig.figurePath = "/plosone/article/figure/image";
+ WombatConfig.figShareInstitutionString = "plos";
+ WombatConfig.doiResolverPrefix = "https://dx.plos.org/";
+</script>
+
+<script type="text/javascript">
+ var WombatConfig = WombatConfig || {};
+ WombatConfig.metrics = WombatConfig.metrics || {};
+ WombatConfig.metrics.referenceUrl = "http://lagotto.io/plos";
+ WombatConfig.metrics.googleScholarUrl = "https://scholar.google.com/scholar";
+ WombatConfig.metrics.googleScholarCitationUrl = WombatConfig.metrics.googleScholarUrl + "?hl=en&lr=&q=";
+ WombatConfig.metrics.crossrefUrl = "https://www.crossref.org";
+</script>
+<script src="https://code.jquery.com/jquery-2.1.4.min.js" ></script>
+<script>window.jQuery || document.write('<script src="/plosone/resource/js/vendor/jquery-2.1.4.min.js""><\/script>')</script>
+
+ <script type="text/javascript" src="https://widgets.figshare.com/static/figshare.js"></script>
+
+
+
+
+
+
+
+
+
+
+
+</head>
+
+
+
+<body class="article plosone">
+
+
+
+
+
+<header>
+
+ <div id="topslot" class="head-top">
+
+<div class="center">
+<div class="title">Advertisement</div>
+<!-- DoubleClick Ad Zone -->
+ <div class='advertisement' id='div-gpt-ad-1458247671871-0' style='width:728px; height:90px;'>
+ <script type='text/javascript'>
+ googletag.cmd.push(function() { googletag.display('div-gpt-ad-1458247671871-0'); });
+ </script>
+ </div>
+</div>
+ </div>
+
+ <div id="user" class="nav">
+ <ul class="nav-user">
+
+
+
+
+ <li ><a href="https://www.plos.org">plos.org</a></li>
+
+
+ <li ><a href="https://community.plos.org/registration/new">create account</a></li>
+
+
+ <li class="highlighted"><a href="/plosone/user/secure/login?page=%2Fplosone%2Farticle%3Fid%3D10.1371%2Fjournal.pone.0213978">sign in</a></li>
+
+ </ul>
+ </div>
+ <div id="pagehdr">
+
+ <nav class="nav-main">
+
+
+
+
+<h1 class="logo">
+ <a href="/plosone/.">PLOS ONE</a>
+</h1>
+
+<section class="top-bar-section">
+
+<ul class="nav-elements">
+
+
+ <li class="multi-col-parent menu-section-header has-dropdown" id="publish">
+ Publish
+ <div class="dropdown mega ">
+ <ul class="multi-col" id="publish-dropdown-list">
+
+ <li class="menu-section-header " id="submissions">
+ <span class="menu-section-header-title"> Submissions </span>
+
+ <ul class="menu-section "
+ id="submissions-dropdown-list">
+ <li>
+ <a href="/plosone/s/getting-started" >Getting Started</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/submission-guidelines" >Submission Guidelines</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/figures" >Figures</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/tables" >Tables</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/supporting-information" >Supporting Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/latex" >LaTeX</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/preprints" >Preprints</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/revising-your-manuscript" >Revising Your Manuscript</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/submit-now" >Submit Now</a>
+ </li>
+
+ <li>
+ <a href="https://collections.plos.org/s/calls-for-papers" >Calls for Papers</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+
+ <li class="menu-section-header " id="policies">
+ <span class="menu-section-header-title"> Policies </span>
+
+ <ul class="menu-section "
+ id="policies-dropdown-list">
+ <li>
+ <a href="/plosone/s/best-practices-in-research-reporting" >Best Practices in Research Reporting</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/human-subjects-research" >Human Subjects Research</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/animal-research" >Animal Research</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/competing-interests" >Competing Interests</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/disclosure-of-funding-sources" >Disclosure of Funding Sources</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/licenses-and-copyright" >Licenses and Copyright</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/data-availability" >Data Availability</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/materials-and-software-sharing" >Materials and Software Sharing</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/ethical-publishing-practice" >Ethical Publishing Practice</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/authorship" >Authorship</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/downloads-and-translations" >Downloads and Translations</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+
+ <li class="menu-section-header " id="manuscript-review-and-publication">
+ <span class="menu-section-header-title"> Manuscript Review and Publication </span>
+
+ <ul class="menu-section "
+ id="manuscript-review-and-publication-dropdown-list">
+ <li>
+ <a href="/plosone/s/criteria-for-publication" >Criteria for Publication</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/editorial-and-peer-review-process" >Editorial and Peer Review Process</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/editor-center" >Editor Center</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/reviewer-guidelines" >Guidelines for Reviewers</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/accepted-manuscripts" >Accepted Manuscripts</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/corrections-and-retractions" >Corrections and Retractions</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/comments" >Comments</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/article-level-metrics" >Article-Level Metrics</a>
+ </li>
+
+ </ul>
+
+ </li>
+ </ul>
+ <div class="calloutcontainer">
+
+
+
+ <h3 class="callout-headline">Submit Your Manuscript</h3>
+
+ <div class="action-contain">
+ <p class="callout-content">
+ Discover a faster, simpler path to publishing in a high-quality journal. <em>PLOS ONE</em> promises fair, rigorous peer review,
+ broad scope, and wide readership – a perfect fit for your research every time.
+ </p>
+
+ <p class="button-contain special">
+ <a class="button button-default" href="/plosone/static/publish">
+ Learn More
+ </a>
+ <a class="button-link" href="https://www.editorialmanager.com/pone/default.asp">
+ Submit Now
+ </a>
+ </p>
+ </div> <!-- opens in siteMenuCalloutDescription -->
+
+
+ </div>
+ </div>
+ </li>
+
+
+
+ <li class="menu-section-header has-dropdown " id="about">
+ <span class="menu-section-header-title"> About </span>
+
+ <ul class="menu-section dropdown "
+ id="about-dropdown-list">
+ <li>
+ <a href="/plosone/static/publish" >Why Publish with PLOS ONE</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/journal-information" >Journal Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/staff-editors" >Staff Editors</a>
+ </li>
+
+ <li>
+ <a href="/plosone/static/editorial-board" >Editorial Board</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/section-editors" >Section Editors</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/advisory-groups" >Advisory Groups</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/find-and-read-articles" >Find and Read Articles</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/publishing-information" >Publishing Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/publication-fees" >Publication Fees</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/press-and-media" >Press and Media</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/contact" >Contact</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+ <li data-js-tooltip-hover="trigger" class="subject-area menu-section-header">
+ Browse
+ </li>
+
+ <li id="navsearch" class="head-search">
+
+
+ <form name="searchForm" action="/plosone/search" method="get">
+ <fieldset>
+ <legend>Search</legend>
+ <label for="search">Search</label>
+ <div class="search-contain">
+ <input id="search" type="text" name="q" placeholder="SEARCH" required/>
+ <button id="headerSearchButton" type="submit"><span class="search-icon"></span></button>
+ </div>
+ </fieldset>
+ <input type="hidden" name="filterJournals" value="PLoSONE"/>
+ </form>
+
+ <a id="advSearch"
+ href="/plosone/search">
+ advanced search
+ </a>
+
+
+
+
+ </li>
+
+ </ul>
+ </section>
+ </nav>
+ </div>
+
+</header><section id="taxonomyContainer">
+
+<div id="taxonomy-browser" class="areas" data-search-url="/plosone/browse">
+ <div class="wrapper">
+ <div class="taxonomy-header">
+ Browse Subject Areas
+ <div id="subjInfo">?</div>
+ <div id="subjInfoText">
+ <p>Click through the PLOS taxonomy to find articles in your field.</p>
+ <p>For more information about PLOS Subject Areas, click
+ <a href="https://github.com/PLOS/plos-thesaurus/blob/develop/README.md" target="_blank" title="Link opens in new window">here</a>.
+ </p>
+ </div>
+ </div>
+ <div class="levels">
+ <div class="levels-container cf">
+ <div class="levels-position"></div>
+ </div>
+ <a href="#" class="prev"></a>
+ <a href="#" class="next active"></a>
+ </div>
+ </div>
+ <div class="taxonomy-browser-border-bottom"></div>
+</div></section>
+<main> <div class="set-grid">
+
+<header class="title-block">
+
+
+
+<script type="text/javascript">
+ var COUNTER_HOST = "https://counter.plos.org/api/v1.0/stats/totals/doi";
+</script>
+
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+
+<ul id="almSignposts" class="signposts">
+ <li id="loadingMetrics">
+ <p>Loading metrics</p>
+ </li>
+</ul>
+
+<script type="text/template" id="signpostsGeneralErrorTemplate">
+ <li id="metricsError">Article metrics are unavailable at this time. Please try again later.</li>
+</script>
+
+<script type="text/template" id="signpostsNewArticleErrorTemplate">
+ <li></li><li></li><li id="tooSoon">Article metrics are unavailable for recently published articles.</li>
+</script>
+
+<script type="text/template" id="signpostsTemplate">
+ <li id="almSaves">
+ <%= s.numberFormat(saveCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#savedHeader">Save</a>
+ <p class="saves-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#savedHeader">Total Mendeley bookmarks.</a></p>
+ </div>
+ </li>
+
+ <li id="almCitations">
+ <%= s.numberFormat(citationCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#citedHeader">Citation</a>
+ <p class="citations-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#citedHeader">Paper's citation count computed by Scopus.</a></p>
+ </div>
+ </li>
+
+ <li id="almViews">
+ <%= s.numberFormat(viewCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#viewedHeader">View</a>
+ <p class="views-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#viewedHeader">Sum of PLOS and PubMed Central page views and downloads.</a></p>
+ </div>
+ </li>
+
+ <li id="almShares">
+ <%= s.numberFormat(shareCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#discussedHeader">Share</a>
+ <p class="shares-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#discussedHeader">Sum of Facebook and Twitter activity.</a></p>
+ </div>
+ </li>
+</script>
+
+ <div class="article-meta">
+
+<div class="classifications">
+ <p class="license-short" id="licenseShort">Open Access</p>
+ <p class="peer-reviewed" id="peerReviewed">Peer-reviewed</p>
+
+<div class="article-type" >
+ <p class="type-article" id="artType">Research Article</p>
+</div>
+
+
+</div>
+
+
+ </div>
+ <div class="article-title-etc">
+
+
+
+<div class="title-authors">
+ <h1 id="artTitle">Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</h1>
+
+<ul class="author-list clearfix" data-js-tooltip="tooltip_container" id="author-list">
+
+
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="0" class="author-name" >
+Yang Li <span class="contribute"> </span>,</a> <div id="author-meta-0" class="author-info" data-js-tooltip="tooltip_target">
+
+ <p>
+ <span class="contribute"> </span> Contributed equally to this work with:
+ Yang Li,
+ Tuanjie Wang
+ </p>
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology,
+
+ Project administration,
+
+ Resources,
+
+ Writing – original draft
+ </p>
+
+ <p id="authAffiliations-0"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose0"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="1" class="author-name" >
+Tuanjie Wang <span class="contribute"> </span>,</a> <div id="author-meta-1" class="author-info" data-js-tooltip="tooltip_target">
+
+ <p>
+ <span class="contribute"> </span> Contributed equally to this work with:
+ Yang Li,
+ Tuanjie Wang
+ </p>
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Project administration
+ </p>
+
+ <p id="authAffiliations-1"><span class="type">Affiliation</span>
+ China Institute of Veterinary Drug Control, Beijing, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose1"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="2" class="author-name" >
+Lin Wang,</a> <div id="author-meta-2" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-2"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose2"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="3" class="author-name" >
+Mingjun Sun,</a> <div id="author-meta-3" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Resources
+ </p>
+
+ <p id="authAffiliations-3"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose3"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="4" class="author-name" >
+Zhizhong Cui,</a> <div id="author-meta-4" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Supervision
+ </p>
+
+ <p id="authAffiliations-4"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose4"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="5" class="author-name" >
+Shuang Chang,</a> <div id="author-meta-5" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-5"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose5"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="6" class="author-name" >
+Yongping Wu,</a> <div id="author-meta-6" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-6"><span class="type">Affiliation</span>
+ College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose6"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="7" class="author-name" >
+Xiaodong Zhang,</a> <div id="author-meta-7" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-7"><span class="type">Affiliation</span>
+ College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose7"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="8" class="author-name" >
+Xiaohui Yu <span class="email"> </span>,</a> <div id="author-meta-8" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Data curation,
+
+ Investigation,
+
+ Writing – review & editing
+ </p>
+ <p id="authCorresponding-8"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-8"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+ <div>
+ <p class="orcid" id="authOrcid-8">
+ <span>
+ <a id="connect-orcid-link" href="http://orcid.org/0000-0003-0555-8727" target="_blank" title="ORCID Registry">
+ <img id="orcid-id-logo" src="/plosone/resource/img/orcid_16x16.png" width="16" height="16" alt="ORCID logo"/>
+ </a>
+ </span>
+ <a href="http://orcid.org/0000-0003-0555-8727">http://orcid.org/0000-0003-0555-8727</a>
+ </p>
+ </div>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose8"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="9" class="author-name" >
+Tao Sun <span class="email"> </span>,</a> <div id="author-meta-9" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Data curation
+ </p>
+ <p id="authCorresponding-9"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-9"><span class="type">Affiliation</span>
+ Shandong Entry-exit Inspection and Quarantine Bureau, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose9"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="10" class="author-name" >
+Peng Zhao <span class="email"> </span></a> <div id="author-meta-10" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Supervision,
+
+ Validation
+ </p>
+ <p id="authCorresponding-10"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-10"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose10"> &#x02A2F; </a>
+ </div>
+</li>
+
+</ul>
+
+</div>
+
+
+<div id="floatTitleTop" data-js-floater="title_author" class="float-title">
+ <div class="set-grid">
+ <div class="float-title-inner">
+ <h1>Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</h1>
+
+<ul id="floatAuthorList" data-js-floater="floated_authors">
+
+ <li data-float-index="1">Yang Li,&nbsp;
+
+ </li>
+ <li data-float-index="2">Tuanjie Wang,&nbsp;
+
+ </li>
+ <li data-float-index="3">Lin Wang,&nbsp;
+
+ </li>
+ <li data-float-index="4">Mingjun Sun,&nbsp;
+
+ </li>
+ <li data-float-index="5">Zhizhong Cui,&nbsp;
+
+ </li>
+ <li data-float-index="6">Shuang Chang,&nbsp;
+
+ </li>
+ <li data-float-index="7">Yongping Wu,&nbsp;
+
+ </li>
+ <li data-float-index="8">Xiaodong Zhang,&nbsp;
+
+ </li>
+ <li data-float-index="9">Xiaohui Yu,&nbsp;
+
+ </li>
+ <li data-float-index="10">Tao Sun
+
+</ul>
+
+
+
+ </div>
+ <div class="logo-close" id="titleTopCloser">
+ <img src="/plosone/resource/img/logo.plos.95.png" alt="PLOS" />
+ <div class="close-floater" title="close">x</div>
+ </div>
+ </div>
+</div>
+
+ <ul class="date-doi">
+ <li id="artPubDate">Published: April 22, 2019</li>
+ <li id="artDoi">
+<a href="https://doi.org/10.1371/journal.pone.0213978">https://doi.org/10.1371/journal.pone.0213978</a>
+ </li>
+ </ul>
+
+ </div>
+ <div>
+
+ </div>
+</header>
+
+ <section class="article-body">
+
+
+
+<ul class="article-tabs">
+
+ <li class="tab-title active" id="tabArticle">
+ <a href="/plosone/article?id=10.1371/journal.pone.0213978" class="article-tab-1">Article</a>
+ </li>
+
+
+ <li class="tab-title " id="tabAuthors">
+ <a href="/plosone/article/authors?id=10.1371/journal.pone.0213978" class="article-tab-2">Authors</a>
+ </li>
+
+
+ <li class="tab-title " id="tabMetrics">
+ <a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978" class="article-tab-3">Metrics</a>
+ </li>
+
+
+ <li class="tab-title " id="tabComments">
+ <a href="/plosone/article/comments?id=10.1371/journal.pone.0213978" class="article-tab-4">Comments</a>
+ </li>
+
+ <li class="tab-title " id="tabRelated">
+ <a href="/plosone/article/related?id=10.1371/journal.pone.0213978" class="article-tab-5">Media Coverage</a>
+ </li>
+
+</ul>
+
+ <div class="article-container">
+
+
+<div id="nav-article">
+ <ul class="nav-secondary">
+
+ <li class="nav-comments" id="nav-comments">
+ <a href="article/comments?id=10.1371/journal.pone.0213978">Reader Comments (0)</a>
+ </li>
+
+ <li class="nav-media" id="nav-media" data-doi="10.1371/journal.pone.0213978">
+ <a href="/plosone/article/related?id=10.1371/journal.pone.0213978">
+ Media Coverage <span id="media-coverage-count"></span>
+ </a>
+ </li>
+
+ <li id="nav-figures"><a href="#" data-doi="10.1371/journal.pone.0213978">Figures</a></li>
+ </ul>
+</div>
+
+<div id="figure-lightbox-container"></div>
+
+<script id="figure-lightbox-template" type="text/template">
+ <div id="figure-lightbox" class="reveal-modal full" data-reveal aria-hidden="true"
+ role="dialog">
+ <div class="lb-header">
+ <h1 id="lb-title"><%= articleTitle %></h1>
+
+ <div id="lb-authors">
+ <span>Yang Li</span>
+ <span>Tuanjie Wang</span>
+ <a class="more-authors" href="/plosone/article/authors?id=10.1371/journal.pone.0213978">...</a>
+ <span>Peng Zhao</span>
+ </div>
+
+ <div class="lb-close" title="close">&nbsp;</div>
+ </div>
+ <div class="img-container">
+ <div class="loader"> <i class="fa-spinner"></i> </div>
+ <img class="main-lightbox-image" src=""/>
+ <aside id="figures-list">
+ <% figureList.each(function (ix, figure) { %>
+ <div class="change-img" data-doi="<%= figure.getAttribute('data-doi') %>">
+ <img class="aside-figure" src="/plosone/article/figure/image?size=inline&id=<%= figure.getAttribute('data-doi') %>" />
+ </div>
+ <% }) %>
+ <div class="dummy-figure">
+ </div>
+ </aside>
+ </div>
+ <div id="lightbox-footer">
+
+ <div id="btns-container" class="lightbox-row <% if(figureList.length <= 1) { print('one-figure-only') } %>">
+ <div class="fig-btns-container reset-zoom-wrapper left">
+ <span class="fig-btn reset-zoom-btn">Reset zoom</span>
+ </div>
+ <div class="zoom-slider-container">
+ <div class="range-slider-container">
+ <span id="lb-zoom-min"></span>
+ <div class="range-slider round" data-slider data-options="start: 20; end: 200; initial: 20;">
+ <span class="range-slider-handle" role="slider" tabindex="0"></span>
+ <span class="range-slider-active-segment"></span>
+ <input type="hidden">
+ </div>
+ <span id="lb-zoom-max"></span>
+ </div>
+ </div>
+ <% if(figureList.length > 1) { %>
+ <div class="fig-btns-container">
+ <span class="fig-btn all-fig-btn"><i class="icon icon-all"></i> All Figures</span>
+ <span class="fig-btn next-fig-btn"><i class="icon icon-next"></i> Next</span>
+ <span class="fig-btn prev-fig-btn"><i class="icon icon-prev"></i> Previous</span>
+ </div>
+ <% } %>
+ </div>
+ <div id="image-context">
+ </div>
+ </div>
+ </div>
+</script>
+
+<script id="image-context-template" type="text/template">
+ <div class="footer-text">
+ <div id="figure-description-wrapper">
+ <div id="view-more-wrapper" style="<% descriptionExpanded? print('display:none;') : '' %>">
+ <span id="figure-title"><%= title %></span>
+ <p id="figure-description">
+ <%= description %>&nbsp;&nbsp;
+ </p>
+ <span id="view-more">show more<i class="icon-arrow-right"></i></span>
+
+ </div>
+ <div id="view-less-wrapper" style="<% descriptionExpanded? print('display:inline-block;') : '' %>" >
+ <span id="figure-title"><%= title %></span>
+ <p id="full-figure-description">
+ <%= description %>&nbsp;&nbsp;
+ <span id="view-less">show less<i class="icon-arrow-left"></i></span>
+ </p>
+ </div>
+ </div>
+ </div>
+ <div id="show-context-container">
+ <a class="btn show-context" href="<%= showInContext(strippedDoi) %>">Show in Context</a>
+ </div>
+ <div id="download-buttons">
+ <h3>Download:</h3>
+ <div class="item">
+ <a href="/plosone/article/figure/image?size=original&download=&id=<%= doi %>" title="original image">
+ <span class="download-btn">TIFF</span>
+ </a>
+ <span class="file-size"><%= fileSizes.original %></span>
+ </div>
+ <div class="item">
+ <a href="/plosone/article/figure/image?size=large&download=&id=<%= doi %>" title="large image">
+ <span class="download-btn">PNG</span>
+ </a>
+ <span class="file-size"><%= fileSizes.large %></span>
+ </div>
+ <div class="item">
+ <a href="/plosone/article/figure/powerpoint?id=<%= doi %>" title="PowerPoint slide">
+ <span class="download-btn">PPT</span>
+ </a>
+ </div>
+
+ </div>
+</script>
+ <div class="article-content">
+
+
+
+
+
+
+<div id="figure-carousel-section">
+ <h2>Figures</h2>
+
+ <div id="figure-carousel">
+
+ <div class="carousel-wrapper">
+ <div class="slider">
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t001">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t001"
+ alt="Table 1"
+ />
+
+ </div>
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t002">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t002"
+ alt="Table 2"
+ />
+
+ </div>
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t003">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t003"
+ alt="Table 3"
+ />
+
+ </div>
+ </div>
+ </div>
+
+ <div class="carousel-control">
+ <span class="button previous"></span>
+ <span class="button next"></span>
+ </div>
+ <div class="carousel-page-buttons">
+
+ </div>
+ </div>
+</div>
+
+
+ <div class="article-text" id="artText">
+
+
+
+
+<div class="abstract toc-section"><a id="abstract0" name="abstract0" data-toc="abstract0" class="link-target" title="Abstract"></a><h2>Abstract</h2><a id="article1.front1.article-meta1.abstract1.p1" name="article1.front1.article-meta1.abstract1.p1" class="link-target"></a><p>Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers.</p>
+</div>
+
+
+<div class="articleinfo"><p><strong>Citation: </strong>Li Y, Wang T, Wang L, Sun M, Cui Z, Chang S, et al. (2019) Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody. PLoS ONE 14(4):
+ e0213978.
+
+ https://doi.org/10.1371/journal.pone.0213978</p><p><strong>Editor: </strong>Eric HY Lau, The University of Hong Kong, CHINA</p><p><strong>Received: </strong>June 22, 2018; <strong>Accepted: </strong>March 5, 2019; <strong>Published: </strong> April 22, 2019</p><p><strong>Copyright: </strong> © 2019 Li et al. This is an open access article distributed under the terms of the <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</p><p><strong>Data Availability: </strong>All relevant data are within the manuscript.</p><p><strong>Funding: </strong>The research was supported by the National Quality Infrastructure of China (2017YFF0210200). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p><p><strong>Competing interests: </strong> The authors have declared that no competing interests exist.</p></div>
+
+
+
+
+
+<div id="section1" class="section toc-section"><a id="sec001" name="sec001" data-toc="sec001" class="link-target" title="Introduction"></a><h2>Introduction</h2><a id="article1.body1.sec1.p1" name="article1.body1.sec1.p1" class="link-target"></a><p>Avian reticuloendotheliosis virus (REV) is one of the most important pathogens that can cause avian tumors. Recently, epidemiological investigations showed that REV infection is very common in Chinese chickens, particularly in local poultry species [<a href="#pone.0213978.ref001" class="ref-tip">1</a>–<a href="#pone.0213978.ref003" class="ref-tip">3</a>]. As REV can be vertically transmitted through hatching eggs [<a href="#pone.0213978.ref004" class="ref-tip">4</a>], if REV-contaminated eggs are used to produce attenuated vaccines, vaccines can be contaminated by REV, which represents one of the crucial ways to disseminate REV [<a href="#pone.0213978.ref005" class="ref-tip">5</a>–<a href="#pone.0213978.ref007" class="ref-tip">7</a>]. Recently in China, the use of REV-contaminated attenuated vaccines is considered to be an important cause of REV infection [<a href="#pone.0213978.ref008" class="ref-tip">8</a>–<a href="#pone.0213978.ref010" class="ref-tip">10</a>].</p>
+<a id="article1.body1.sec1.p2" name="article1.body1.sec1.p2" class="link-target"></a><p>To overcome this problem, as the Ministry of Agriculture of China stipulated, all attenuated poultry vaccines must use SPF chickens as raw materials to produce attenuated vaccines, and all vaccine producers must confirm whether SPF chickens are infected by REV or not using sampled serum antibody detection. However, because of the specificity of housing standards in SPF poultry farms, others cannot freely enter a breeding area for sampling and detection. In this current study, we attempted to replace antibody detection in serum with antibody detection in egg yolks of SPF chickens.</p>
+</div>
+
+<div id="section2" class="section toc-section"><a id="sec002" name="sec002" data-toc="sec002" class="link-target" title="Results"></a><h2>Results</h2>
+<div id="section1" class="section toc-section"><a id="sec003" name="sec003" class="link-target" title="Determination of the optimal yolk dilution"></a>
+<h3>Determination of the optimal yolk dilution</h3>
+<a id="article1.body1.sec2.sec1.p1" name="article1.body1.sec2.sec1.p1" class="link-target"></a><p>Under the same conditions, we measured REV antibody titers in paired yolk and serum samples collected on the same day or one day before or after in 40 SPF chickens during the initial egg-laying stage when the chickens were 23 weeks old. <a href="#pone-0213978-t001">Table 1</a> shows the “goodness of fit” between yolk antibody titers diluted to various concentrations and serum antibody titers at the required concentration. By comparison, we found that REV antibody detection in the yolk at a 1:300 dilution had the highest goodness of fit with serum antibody measurements, and reached 97.5%.</p>
+<a class="link-target" id="pone-0213978-t001" name="pone-0213978-t001"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t001"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t001" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t001"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t001" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t001">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t001">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 1. </span> Consistent yolk and serum antibody measurements with different dilutions of yolk.</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t001">
+ https://doi.org/10.1371/journal.pone.0213978.t001</a></p></div></div>
+
+<div id="section2" class="section toc-section"><a id="sec004" name="sec004" class="link-target" title="Comparison of the goodness of fit for ALV-Ab antibody measurements in serum and yolk from SPF chickens of different ages"></a>
+<h3>Comparison of the goodness of fit for ALV-Ab antibody measurements in serum and yolk from SPF chickens of different ages</h3>
+<a id="article1.body1.sec2.sec2.p1" name="article1.body1.sec2.sec2.p1" class="link-target"></a><p>In 25–34-week-old chickens, serum and hatching eggs were sampled once per week, and a total of 720 serum samples and 720 yolk samples were collected from 40 SPF infected chickens and 32 SPF chickens without virus challenge. <a href="#pone-0213978-t002">Table 2</a> showed that the yolk antibody findings were completely consistent with those based on serum antibody detection within 10 weeks, as the serum antibody-positive chickens were all yolk antibody-positive, and the serum antibody-negative chickens were all yolk antibody-negative. Additionally, 35 of 40 SPF chickens challenged with REV alone were always REV antibody-positive in the serum and yolk, while 4 were always REV antibody-negative. All 32 SPF chickens without virus challenge were always REV antibody-positive in the serum and yolk. The goodness of fit for serum antibody and yolk antibody detection reached 100%.</p>
+<a class="link-target" id="pone-0213978-t002" name="pone-0213978-t002"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t002"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t002" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t002"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t002" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t002">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t002">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 2. </span> Agreement of yolk and serum antibody measurements with different dilutions of yolk.</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t002">
+ https://doi.org/10.1371/journal.pone.0213978.t002</a></p></div></div>
+
+<div id="section3" class="section toc-section"><a id="sec005" name="sec005" class="link-target" title="REV antibody detection in serum and yolk from different SPF chicken populations"></a>
+<h3>REV antibody detection in serum and yolk from different SPF chicken populations</h3>
+<a id="article1.body1.sec2.sec3.p1" name="article1.body1.sec2.sec3.p1" class="link-target"></a><p>A total of 1000 yolk samples and 1000 serum samples from 10 different SPF chicken populations were detected for REV antibody. <a href="#pone-0213978-t003">Table 3</a> showed that all samples tested were negative based on yolk and serum antibody detection. Our evaluation results were consistent and without false positive results, indicating that the test SPF chicken populations were not infected by REV.</p>
+<a class="link-target" id="pone-0213978-t003" name="pone-0213978-t003"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t003"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t003" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t003"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t003" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t003">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t003">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 3. </span> Detection of REV antibody from 10 SPF chicken flocks in China (random collection).</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t003">
+ https://doi.org/10.1371/journal.pone.0213978.t003</a></p></div></div>
+</div>
+
+<div id="section3" class="section toc-section"><a id="sec006" name="sec006" data-toc="sec006" class="link-target" title="Discussion"></a><h2>Discussion</h2><a id="article1.body1.sec3.p1" name="article1.body1.sec3.p1" class="link-target"></a><p>Recently, epidemiological surveys have shown that different Chinese chicken populations are frequently infected by REV, especially in local Chinese chicken species [<a href="#pone.0213978.ref001" class="ref-tip">1</a>–<a href="#pone.0213978.ref003" class="ref-tip">3</a>]. To control REV infection, many measures have been employed, including the use of attenuated vaccines without REV contamination. In China and other countries, the possibility of REV contamination in attenuated poultry vaccines has been a major concern for many years. Many REV infections are thought to be caused by REV infection in contaminated attenuated vaccines, particularly for the most frequently used fowlpox virus vaccine (FPV) and anti-Marek’s Disease vaccines [<a href="#pone.0213978.ref005" class="ref-tip">5</a>–<a href="#pone.0213978.ref013" class="ref-tip">13</a>]. Additionally, the capability of REV to integrate into the genome of other viruses complicates its diagnosis and prevention [<a href="#pone.0213978.ref014" class="ref-tip">14</a>–<a href="#pone.0213978.ref021" class="ref-tip">21</a>]. Awad <em>et al</em>. detected REV in contaminated FPV vaccine using PCR identification and REV antibody detection for virus isolation and identification in vaccinated SPF chickens [<a href="#pone.0213978.ref007" class="ref-tip">7</a>]. REV contamination in avian attenuated vaccines can lead to serious consequences, such as a significant reduction in antibody levels in vaccine-immunized chicken populations [<a href="#pone.0213978.ref022" class="ref-tip">22</a>].</p>
+<a id="article1.body1.sec3.p2" name="article1.body1.sec3.p2" class="link-target"></a><p>The REV contamination in attenuated vaccines may occur during the production process, but the use of REV-contaminated chicken embryos as raw materials is always the main cause. The national standards of China specify that vaccine production enterprises or SPF chicken breeding manufactures must periodically measure REV antibody levels in SPF chicken serum to evaluate the REV cleanliness in specific flocks. Because of differences in SPF chicken breeding environments, other individuals should not be allowed to enter a SPF chicken breeding area for sampling. This current approach causes both stress responses in SPF chickens and introduces the risk of false results for SPF chicken serum tests resulting from the inspection process. Therefore, the Ministry of Agriculture of China asked whether yolk antibody detection in hatching eggs could be used as a substitute for serum antibody detection to evaluate exogenous virus contamination in SPF chicken embryos.</p>
+<a id="article1.body1.sec3.p3" name="article1.body1.sec3.p3" class="link-target"></a><p>The yolk dilution has a strong influence on the antibody detection results, as excessive high yolk concentration is prone to yield false negative or false positive results. The results of this present study showed that yolk at a 1:300 dilution gave the best goodness of fit between the antibody-negative or positive results based on yolk or serum antibody detection. To precisely and scientifically reveal the correlation between the yolk and serum antibody detection, we compared REV antibody detection results in the yolk and serum of 72 SPF chickens (40 were inoculated with REV one month prior to egg-laying) for 10 consecutive weeks. We found that for the 72 chickens, serum antibody detection results coincided with yolk antibody results at a rate of 100%. Our findings indicate that it is feasible to replace serum antibody tests with yolk antibody detection to monitor REV infection in SPF chickens.</p>
+<a id="article1.body1.sec3.p4" name="article1.body1.sec3.p4" class="link-target"></a><p>At the optimal dilution determined in this study, a total of 1000 yolk samples and 1000 serum samples from 10 separate SPF chicken populations were tested for REV antibodies, and all showed negative results. The results of undetected antibodies showed that these chickens were not infected with REV or that although these chickens were infected with REV, not enough antibodies were detected. In order to avoid the false negative, we consider that chickens repeatedly tested negatively are not infected with REV, which is very important in flock surveillance. Additionally, detection results that used both methods were fully consistent. Importantly, no false positive results were obtained. These robust results indicate that contemporary SPF chicken embryos in China are mostly or fully not contaminated by REV. Our findings suggest that vaccine production enterprises could evaluate the REV cleanliness of SPF chicken farms by detecting antibodies in the yolk of SPF eggs. This process not only reduces the stress responses of SPF chickens during serum sampling and provides convenience for sampling, it also yields more reliable samples. Indeed, compared with serum sample results, hatching egg-based data are less prone to human error.</p>
+</div>
+
+<div id="section4" class="section toc-section"><a id="sec007" name="sec007" data-toc="sec007" class="link-target" title="Materials and methods"></a><h2>Materials and methods</h2>
+<div id="section1" class="section toc-section"><a id="sec008" name="sec008" class="link-target" title="REV strain"></a>
+<h3>REV strain</h3>
+<a id="article1.body1.sec4.sec1.p1" name="article1.body1.sec4.sec1.p1" class="link-target"></a><p>The strain REV-HA9901 was isolated in 1999 and full-length genomic sequencing had been completed (GenBank Accession No. AY842951) [<a href="#pone.0213978.ref023" class="ref-tip">23</a>]. Supernatants of the pre-frozen virus cells at –80°C were used to calculate TCID<sub>50</sub> by the Karber method; 0.1 mL supernatant of CEF cells contained 10 <sup>4.5</sup> TCID<sub>50</sub>.</p>
+</div>
+
+<div id="section2" class="section toc-section"><a id="sec009" name="sec009" class="link-target" title="Rearing and virus challenge of SPF chickens"></a>
+<h3>Rearing and virus challenge of SPF chickens</h3>
+<a id="article1.body1.sec4.sec2.p1" name="article1.body1.sec4.sec2.p1" class="link-target"></a><p>A total of 40 nineteen-week-old SPF chickens were purchased from SPAFAS Poultry Co., and were reared in HEPA-filtered negative-pressure isolators. At nineteen weeks of age, groups of 13, 14, and 13 chickens were vaccinated with 10<sup>3</sup> TCID<sub>50</sub> HA9901, 10<sup>4</sup> TCID<sub>50</sub> HA9901, and 10<sup>5</sup> TCID<sub>50</sub> of HA9901, respectively. All labeled chickens were separately raised within a single cage in an SPF animal feeding unit so that eggs and serum samples could correspond 1:1 with chickens. A total of 32 SPF chickens in the same batch were reared in isolation environments as a negative control. All these chickens from each group were sacrificed by intravenous administration of barbiturates. The use of all laboratory animals in this study was approved by the scientific ethical committee of Shandong province.</p>
+</div>
+
+<div id="section3" class="section toc-section"><a id="sec010" name="sec010" class="link-target" title="Determination of the optimal yolk dilution"></a>
+<h3>Determination of the optimal yolk dilution</h3>
+<a id="article1.body1.sec4.sec3.p1" name="article1.body1.sec4.sec3.p1" class="link-target"></a><p>The 40 inoculated SPF chickens all began laying eggs when 23-weeks-old, and the hatching eggs and serum samples were collected from each chicken. Serum samples were diluted to the optimal concentration in accordance with the instructions of the ELISA test kit for REV antibody (IDEXX Company); and yolk samples were diluted to 1:150, 1:200, 1:300, and 1:400. To minimize the possibility of human errors, paired serum and yolk from each chicken were tested using the same kit by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly the test was repeated. Based on these results, we determined the optimal dilution of yolk at which the detection was in accordance with that determined based on serum antibody detection.</p>
+</div>
+
+<div id="section4" class="section toc-section"><a id="sec011" name="sec011" class="link-target" title="REV antibody detection in serum and yolk among chickens of different ages"></a>
+<h3>REV antibody detection in serum and yolk among chickens of different ages</h3>
+<a id="article1.body1.sec4.sec4.p1" name="article1.body1.sec4.sec4.p1" class="link-target"></a><p>Each week, paired egg and serum samples from each chicken were collected from 72 SPF chickens for 10 weeks from the age of 25 to 34 weeks old. If a chicken did not lay eggs on the blood-collecting day, the egg laid one day before or after the blood collection was used. For REV antibody detection, serum samples were diluted according to the manufacturer’s instructions and yolk samples were diluted in accord with the optimal dilution determined in Section 1.3. To minimize the possibility of human errors, paired serum and yolk from each chicken were tested using the same batch of kits by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly, tests were repeated. Finally, we compared the “goodness of fit” between the yolk antibody sampled during different stages and serum antibody measurements.</p>
+</div>
+
+<div id="section5" class="section toc-section"><a id="sec012" name="sec012" class="link-target" title="REV antibody detection in the serum and yolk of different SPF chicken populations"></a>
+<h3>REV antibody detection in the serum and yolk of different SPF chicken populations</h3>
+<a id="article1.body1.sec4.sec5.p1" name="article1.body1.sec4.sec5.p1" class="link-target"></a><p>Paired egg and serum samples from each chicken were sampled from 10 distinct Chinese SPF chicken populations. Serum samples were diluted in accordance with the test kit manufacturer’s instructions (IDEXX Company), and yolk samples were diluted in accordance with the optimal dilution that was determined. We separately estimated the REV cleanliness for different SPF chicken populations based on the two previously described examination methods, and compared differences in the actual operation. To minimize the introduction of human errors, paired serum and yolk samples from a chicken were tested using the same batch of kits by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly the tests were repeated.</p>
+</div>
+</div>
+
+
+
+
+
+<div class="section toc-section"><a id="ack" name="ack" data-toc="ack" title="Acknowledgments" class="link-target"></a><h2>Acknowledgments</h2>
+<a id="article1.back1.ack1.p1" name="article1.back1.ack1.p1" class="link-target"></a><p>The research was supported by the National Quality Infrastructure of China (2017YFF0210200).</p>
+</div><div class="toc-section"><a id="references" name="references" class="link-target" data-toc="references" title="References"></a><h2>References</h2><ol class="references"><li id="ref1"><span class="order">1.
+ </span><a name="pone.0213978.ref001" id="pone.0213978.ref001" class="link-target"></a>Cheng Z., Shi Y., Zhan L., Zhu G., Diao X. and Cui Z. (2007) Occurrence of reticuloendotheliosis in Chinese partridge. J Vet Med Sci. 69(12): 1295–1298. pmid:18176029 <ul class="reflinks"><li><a href="#" data-author="Cheng" data-cit="ChengZ.%2C%20ShiY.%2C%20ZhanL.%2C%20ZhuG.%2C%20DiaoX.%20and%20CuiZ.%20%282007%29%20Occurrence%20of%20reticuloendotheliosis%20in%20Chinese%20partridge.%20J%20Vet%20Med%20Sci.%2069%2812%29%3A%201295%E2%80%931298.%2018176029" data-title="Occurrence%20of%20reticuloendotheliosis%20in%20Chinese%20partridge" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/18176029" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Occurrence+of+reticuloendotheliosis+in+Chinese+partridge+Cheng+2007" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref2"><span class="order">2.
+ </span><a name="pone.0213978.ref002" id="pone.0213978.ref002" class="link-target"></a>Cui Z., Sun S., Zhang Z., and Meng S. (2009) Simultaneous endemic infections with subgroup J avian leukosis virus and reticuloendotheliosis virus in commercial and local breeds of chickens. Avian Pathol. 38(6): 443–448. pmid:19937533 <ul class="reflinks" data-doi="10.1080/03079450903349188"><li><a href="https://doi.org/10.1080/03079450903349188" data-author="doi-provided" data-cit="doi-provided" data-title="doi-provided" target="_new" title="Go to article">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/19937533" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Simultaneous+endemic+infections+with+subgroup+J+avian+leukosis+virus+and+reticuloendotheliosis+virus+in+commercial+and+local+breeds+of+chickens+Cui+2009" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref3"><span class="order">3.
+ </span><a name="pone.0213978.ref003" id="pone.0213978.ref003" class="link-target"></a>Zhao P., Ma C., Du Y., and Cui Z. (2012) Serological survey of the Reticuloendotheliosis virus infection in China native chicken flocks. Pak Vet J. 32:621–623. <ul class="reflinks"><li><a href="#" data-author="Zhao" data-cit="ZhaoP.%2C%20MaC.%2C%20DuY.%2C%20and%20CuiZ.%20%282012%29%20Serological%20survey%20of%20the%20Reticuloendotheliosis%20virus%20infection%20in%20China%20native%20chicken%20flocks.%20Pak%20Vet%20J.%2032%3A621%E2%80%93623." data-title="Serological%20survey%20of%20the%20Reticuloendotheliosis%20virus%20infection%20in%20China%20native%20chicken%20flocks" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Serological+survey+of+the+Reticuloendotheliosis+virus+infection+in+China+native+chicken+flocks+Zhao+2012" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref22"><span class="order">22.
+ </span><a name="pone.0213978.ref022" id="pone.0213978.ref022" class="link-target"></a>Witter R. L., Lee L. F., Bacon L. D. and Smith E. J. (1979) Depression of vaccinal immunity to Marek’s disease by infection with reticuloendotheliosis virus. Infection and Immunity. 26:90–98. pmid:227800 <ul class="reflinks"><li><a href="#" data-author="Witter" data-cit="WitterR.%20L.%2C%20LeeL.%20F.%2C%20BaconL.%20D.%20and%20SmithE.%20J.%20%281979%29%20Depression%20of%20vaccinal%20immunity%20to%20Marek%E2%80%99s%20disease%20by%20infection%20with%20reticuloendotheliosis%20virus.%20Infection%20and%20Immunity.%2026%3A90%E2%80%9398.%20227800" data-title="Depression%20of%20vaccinal%20immunity%20to%20Marek%E2%80%99s%20disease%20by%20infection%20with%20reticuloendotheliosis%20virus" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/227800" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Depression+of+vaccinal+immunity+to+Marek%E2%80%99s+disease+by+infection+with+reticuloendotheliosis+virus+Witter+1979" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref23"><span class="order">23.
+ </span><a name="pone.0213978.ref023" id="pone.0213978.ref023" class="link-target"></a>Wang Y., Cui Z. and Jiang S. (2005) Sequencing and analysis of whole genome nucleotide sequence of Chinese REV isolate HA9901. Science in China Serices C: Life Sciences. 35:340–380. <ul class="reflinks"><li><a href="#" data-author="Wang" data-cit="WangY.%2C%20CuiZ.%20and%20JiangS.%20%282005%29%20Sequencing%20and%20analysis%20of%20whole%20genome%20nucleotide%20sequence%20of%20Chinese%20REV%20isolate%20HA9901.%20Science%20in%20China%20Serices%20C%3A%20Life%20Sciences.%2035%3A340%E2%80%93380." data-title="Sequencing%20and%20analysis%20of%20whole%20genome%20nucleotide%20sequence%20of%20Chinese%20REV%20isolate%20HA9901" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Sequencing+and+analysis+of+whole+genome+nucleotide+sequence+of+Chinese+REV+isolate+HA9901+Wang+2005" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li></ol></div>
+
+
+
+ <div class="ref-tooltip">
+ <div class="ref_tooltip-content">
+
+ </div>
+ </div>
+
+ </div>
+ </div>
+ </div>
+
+ </section>
+ <aside class="article-aside">
+
+
+<!--[if IE 9]>
+<style>
+.dload-xml {margin-top: 38px}
+</style>
+<![endif]-->
+<div class="dload-menu">
+ <div class="dload-pdf">
+ <a href="/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ id="downloadPdf" target="_blank">Download PDF</a>
+ </div>
+ <div data-js-tooltip-hover="trigger" class="dload-hover">&nbsp;
+ <ul class="dload-xml" data-js-tooltip-hover="target">
+ <li><a href="/plosone/article/citation?id=10.1371/journal.pone.0213978"
+ id="downloadCitation">Citation</a></li>
+ <li><a href="/plosone/article/file?id=10.1371/journal.pone.0213978&type=manuscript"
+ id="downloadXml">XML</a>
+ </li>
+ </ul>
+
+ </div>
+</div>
+
+<div class="aside-container">
+
+<div class="print-article" id="printArticle" data-js-tooltip-hover="trigger">
+
+ Print
+ <ul class="print-options" data-js-tooltip-hover="target">
+ <li>
+ <a href="#" onclick="window.print(); return false;" class="preventDefault" id="printBrowser" title="Print
+ Article">Print article</a>
+ </li>
+
+
+
+
+<li>
+<a title="Odyssey Press" href="https://www.odysseypress.com/onlinehost/reprint_order.php?type=A&amp;page=0&amp;journal=7&amp;doi=10.1371%2Fjournal.pone.0213978&amp;volume=&amp;issue=&amp;title=Assessment%20on%20reticuloendotheliosis%20virus%20infection%20in%20specific-pathogen-free%20chickens%20based%20on%20detection%20of%20yolk%20antibody&amp;author_name=Yang%20Li%2C%20Tuanjie%20Wang%2C%20Lin%20Wang%2C%20Mingjun%20Sun%2C%20Zhizhong%20Cui%2C%20Shuang%20Chang%2C%20Yongping%20Wu%2C%20Xiaodong%20Zhang%2C%20Xiaohui%20Yu%2C%20Tao%20Sun%2C%20Peng%20Zhao&amp;start_page=1&amp;end_page=7">EzReprint </a>
+</li>
+
+ </ul>
+</div>
+<div class="share-article" id="shareArticle" data-js-tooltip-hover="trigger">
+ Share
+ <ul data-js-tooltip-hover="target" class="share-options" id="share-options">
+
+<li><a href="https://www.reddit.com/submit?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareReddit" target="_blank" title="Submit to Reddit"><img src="/plosone/resource/img/icon.reddit.16.png" width="16" height="16" alt="Reddit">Reddit</a></li>
+
+<li><a href="https://plus.google.com/share?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareGoogle" target="_blank" title="Share on Google+"><img src="/plosone/resource/img/icon.gplus.16.png" width="16" height="16" alt="Google+">Google+</a></li>
+
+<li><a href="https://www.facebook.com/share.php?u=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&t=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" id="shareFacebook" target="_blank" title="Share on Facebook"><img src="/plosone/resource/img/icon.fb.16.png" width="16" height="16" alt="Facebook">Facebook</a></li>
+
+<li><a href="https://www.linkedin.com/shareArticle?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&title=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody&summary=Checkout this article I found at PLOS" id="shareLinkedIn" target="_blank" title="Add to LinkedIn"><img src="/plosone/resource/img/icon.linkedin.16.png" width="16" height="16" alt="LinkedIn">LinkedIn</a></li>
+
+<li><a href="https://www.mendeley.com/import/?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareMendeley" target="_blank" title="Add to Mendeley"><img src="/plosone/resource/img/icon.mendeley.16.png" width="16" height="16" alt="Mendeley">Mendeley</a></li>
+
+<li><a href="https://www.pubchase.com/library?add_aid=10.1371/journal.pone.0213978&source=plos" id="sharePubChase" target="_blank" title="Add to PubChase"><img src="/plosone/resource/img/icon.pc.16.png" width="16" height="16" alt="PubChase">PubChase</a></li>
+
+ <li><a href="https://twitter.com/intent/tweet?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&text=%23PLOSONE%3A%20Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" target="_blank" title="share on Twitter" id="twitter-share-link"><img src="/plosone/resource/img/icon.twtr.16.png" width="16" height="16" alt="Twitter">Twitter</a></li>
+
+<li><a href="mailto:?subject=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody&body=I%20thought%20you%20would%20find%20this%20article%20interesting.%20From%20PLOS ONE:%20https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareEmail" rel="noreferrer" aria-label="Email"><img src="/plosone/resource/img/icon.email.16.png" width="16" height="16" alt="Email">Email</a></li>
+ </ul>
+</div></div>
+
+
+ <!-- Crossmark 2.0 widget -->
+<script src="https://crossmark-cdn.crossref.org/widget/v2.0/widget.js"></script>
+<a data-target="crossmark"><img width="150" src="https://crossmark-cdn.crossref.org/widget/v2.0/logos/CROSSMARK_BW_horizontal.svg"></a>
+<!-- End Crossmark 2.0 widget -->
+
+
+
+
+
+
+
+<div class="skyscraper-container">
+ <div class="title">Advertisement</div>
+<!-- DoubleClick Ad Zone -->
+ <div class='advertisement' id='div-gpt-ad-1458247671871-1' style='width:160px; height:600px;'>
+ <script type='text/javascript'>
+ googletag.cmd.push(function() { googletag.display('div-gpt-ad-1458247671871-1'); });
+ </script>
+ </div>
+</div>
+
+
+
+
+<div class="subject-areas-container">
+ <h3>Subject Areas <div id="subjInfo">?</div>
+ <div id="subjInfoText">
+ <p>For more information about PLOS Subject Areas, click
+ <a href="https://github.com/PLOS/plos-thesaurus/blob/develop/README.md" target="_blank" title="Link opens in new window">here</a>.</p>
+ <span class="inline-intro">We want your feedback.</span> Do these Subject Areas make sense for this article? Click the target next to the incorrect Subject Area and let us know. Thanks for your help!
+
+
+ </div>
+ </h3>
+ <ul id="subjectList">
+ <li>
+ <a class="taxo-term" title="Search for articles about Chickens"
+ href="/plosone/search?filterSubjects=Chickens&filterJournals=PLoSONE&q=">Chickens</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Chickens"><p class="taxo-explain">Is the Subject Area <strong>"Chickens"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Antibodies"
+ href="/plosone/search?filterSubjects=Antibodies&filterJournals=PLoSONE&q=">Antibodies</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Antibodies"><p class="taxo-explain">Is the Subject Area <strong>"Antibodies"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Livestock"
+ href="/plosone/search?filterSubjects=Livestock&filterJournals=PLoSONE&q=">Livestock</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Livestock"><p class="taxo-explain">Is the Subject Area <strong>"Livestock"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Attenuated vaccines"
+ href="/plosone/search?filterSubjects=Attenuated+vaccines&filterJournals=PLoSONE&q=">Attenuated vaccines</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Attenuated vaccines"><p class="taxo-explain">Is the Subject Area <strong>"Attenuated vaccines"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Enzyme-linked immunoassays"
+ href="/plosone/search?filterSubjects=Enzyme-linked+immunoassays&filterJournals=PLoSONE&q=">Enzyme-linked immunoassays</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Enzyme-linked immunoassays"><p class="taxo-explain">Is the Subject Area <strong>"Enzyme-linked immunoassays"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Poultry"
+ href="/plosone/search?filterSubjects=Poultry&filterJournals=PLoSONE&q=">Poultry</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Poultry"><p class="taxo-explain">Is the Subject Area <strong>"Poultry"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Animal sexual behavior"
+ href="/plosone/search?filterSubjects=Animal+sexual+behavior&filterJournals=PLoSONE&q=">Animal sexual behavior</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Animal sexual behavior"><p class="taxo-explain">Is the Subject Area <strong>"Animal sexual behavior"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Vaccines"
+ href="/plosone/search?filterSubjects=Vaccines&filterJournals=PLoSONE&q=">Vaccines</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Vaccines"><p class="taxo-explain">Is the Subject Area <strong>"Vaccines"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ </ul>
+</div>
+<div id="subjectErrors"></div>
+
+
+<div class="twitter-container">
+ <h3>Archived Tweets</h3>
+ <ul id="tweetList">
+
+ </ul>
+ <div class="load-more">Load more <span></span></div>
+ <div class="view-all"><a href="https://alm.plos.org/works/doi.org/10.1371/journal.pone.0213978?source_id=twitter">View all tweets</a>
+ </div>
+</div>
+
+<script type="text/template" id="twitterModuleItemTemplate">
+ <% _.each(items, function(item) { %>
+ <li>
+ <div class="tweet-info">
+ <a href="https://twitter.com/<%= item.user %>">
+ <span class="imgholder">
+ <img class="imgLoad" src="<%= item.user_profile_image %>">
+ </span>
+ <div class="tweetDate"><%= item.created_at %></div>
+ <div class="tweetUser">
+ <strong><%= item.user_name %></strong>
+ <span>@<%= item.user %></span>
+ </div>
+ </a>
+ </div>
+ <div class="tweetText">
+ <%= item.text %>
+ </div>
+ <div id="tweetActions">
+ <a class="tweet-reply" href="https://twitter.com/intent/tweet?in_reply_to<%= item.id %>&amp;text=@<%= item.user %>">
+ <div>&nbsp;</div> Reply
+ </a>
+ <a class="tweet-retweet" href="https://twitter.com/intent/retweet?tweet_id=<%= item.id %>">
+ <div>&nbsp;</div> Retweet
+ </a>
+ <a class="tweet-favorite" href="https://twitter.com/intent/favorite?tweet_id=<%= item.id %>">
+ <div>&nbsp;</div> Favorite
+ </a>
+ </div>
+ </li>
+ <% }); %>
+</script>
+
+
+ </aside>
+</div>
+
+
+
+
+</main>
+
+<footer id="pageftr">
+ <div class="row">
+
+
+ <div class="block x-small">
+
+
+<ul class="nav nav-secondary">
+ <li class="ftr-header"><a href="https://www.plos.org/publications/journals/">Publications</a></li>
+ <li><a href="/plosbiology/" id="ftr-bio">PLOS Biology</a></li>
+ <li><a href="/plosmedicine/" id="ftr-med">PLOS Medicine</a></li>
+ <li><a href="/ploscompbiol/" id="ftr-compbio">PLOS Computational Biology</a></li>
+ <li><a href="/plosgenetics/" id="ftr-gen">PLOS Genetics</a></li>
+ <li><a href="/plospathogens/" id="ftr-path">PLOS Pathogens</a></li>
+ <li><a href="/plosone/" id="ftr-one">PLOS ONE</a></li>
+ <li><a href="/plosntds/" id="ftr-ntds">PLOS Neglected Tropical Diseases</a></li>
+ </ul>
+ </div>
+
+ <div class="block xx-small">
+
+
+<ul class="nav nav-tertiary">
+ <li>
+ <a href="https://www.plos.org" id="ftr-home">Home</a>
+ </li>
+ <li>
+ <a href="https://blogs.plos.org" id="ftr-blog">Blogs</a>
+ </li>
+ <li>
+ <a href="https://collections.plos.org" id="ftr-collections">Collections</a>
+ </li>
+ <li>
+ <a href="mailto:webmaster@plos.org" id="ftr-feedback">Give feedback</a>
+ </li>
+ <li>
+ <a href="/plosone/lockss-manifest" id="ftr-lockss">LOCKSS</a>
+ </li>
+</ul>
+ </div>
+ <div class="block xx-small">
+
+<ul class="nav nav-primary">
+ <li><a href="https://www.plos.org/privacy-policy" id="ftr-privacy">Privacy Policy</a></li>
+ <li><a href="https://www.plos.org/terms-of-use" id="ftr-terms">Terms of Use</a></li>
+ <li><a href="https://www.plos.org/advertise/" id="ftr-advertise">Advertise</a></li>
+ <li><a href="https://www.plos.org/media-inquiries" id="ftr-media">Media Inquiries</a></li>
+ <li><a href="https://www.plos.org/contact" id="ftr-contact">Contact</a></li>
+</ul>
+ </div>
+ <div class="block x-small">
+
+
+<p class="footer-non-profit-statement">PLOS is a nonprofit 501(c)(3) corporation, #C2354500, based in San Francisco, California, US</p> <img src="/plosone/resource/img/logo-plos-footer.png" alt="PLOS" class="logo-footer"/>
+ </div>
+
+
+
+&nbsp;
+<!--
+ Webapp build: 3.7.12 at 20191001163109 by teamcity, commit:
+ Service build: 2.5.4 at 20191001163228 by teamcity, commit:
+ Enabled dev features: []
+ -->
+
+ </div>
+
+
+
+</footer>
+
+
+
+
+<script type="text/javascript">
+ var ArticleData = {
+ doi: '10.1371/journal.pone.0213978',
+ title: '<article-title xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody<\/article-title>',
+ date: 'Apr 22, 2019'
+ };
+</script>
+
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+
+
+
+
+
+
+<script type="text/javascript" async src="https://platform.twitter.com/widgets.js"></script>
+
+
+
+
+
+<!-- This file should be loaded before the renderJs, to avoid conflicts with the FigShare, that implements the MathJax also. -->
+
+<!-- mathjax configuration options -->
+<!-- more can be found at http://docs.mathjax.org/en/latest/ -->
+<script type="text/x-mathjax-config">
+MathJax.Hub.Config({
+ "HTML-CSS": {
+ scale: 100,
+ availableFonts: ["STIX","TeX"],
+ preferredFont: "STIX",
+ webFont: "STIX-Web",
+ linebreaks: { automatic: false }
+ },
+ jax: ["input/MathML", "output/HTML-CSS"]
+});
+</script>
+
+<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=MML_HTMLorMML"></script>
+
+<script src="/plosone/resource/compiled/asset_57YNUH6YLHJPCLJ7347ODA3HRPF472A4.js"></script>
+<div class="reveal-modal-bg"></div>
+</body>
+</html>
diff --git a/python/tests/files/scielo_article.jats.xml b/python/tests/files/scielo_article.jats.xml
new file mode 100644
index 0000000..08c864e
--- /dev/null
+++ b/python/tests/files/scielo_article.jats.xml
@@ -0,0 +1,336 @@
+<?xml version="1.0" encoding="ISO-8859-1"?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<front>
+<journal-meta>
+<journal-id>1683-9803</journal-id>
+<journal-title><![CDATA[Pediatría (Asunción)]]></journal-title>
+<abbrev-journal-title><![CDATA[Pediatr. (Asunción)]]></abbrev-journal-title>
+<issn>1683-9803</issn>
+<publisher>
+<publisher-name><![CDATA[Sociedad Paraguaya de Pediatría]]></publisher-name>
+</publisher>
+</journal-meta>
+<article-meta>
+<article-id>S1683-98032015000200002</article-id>
+<article-id pub-id-type="doi">10.18004/ped.2015.agosto.102-107</article-id>
+<title-group>
+<article-title xml:lang="es"><![CDATA[Prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años en las comunidades indígenas de Yby Yau y Azote’y, 2011]]></article-title>
+<article-title xml:lang="en"><![CDATA[Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011]]></article-title>
+</title-group>
+<contrib-group>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Valiente]]></surname>
+<given-names><![CDATA[Syntia Carolina]]></given-names>
+</name>
+<xref ref-type="aff" rid="A01"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Cañete]]></surname>
+<given-names><![CDATA[Manuel]]></given-names>
+</name>
+<xref ref-type="aff" rid="A02"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Cohene Velazquez]]></surname>
+<given-names><![CDATA[Bartola]]></given-names>
+</name>
+<xref ref-type="aff" rid="A03"/>
+</contrib>
+</contrib-group>
+<aff id="A01">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A02">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A03">
+<institution><![CDATA[,Puesto de Salud de Paso Tuya. Azote’y. Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<pub-date pub-type="pub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<pub-date pub-type="epub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<volume>42</volume>
+<numero>2</numero>
+<fpage>102</fpage>
+<lpage>107</lpage>
+<copyright-statement/>
+<copyright-year/>
+<self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_arttext&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_abstract&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_pdf&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><abstract abstract-type="short" xml:lang="es"><p><![CDATA[Introducción: La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrición. La desnutrición infantil no es solo un problema de falta de alimentos, es un conflicto social más profundo. La prevalencia de desnutrición en menores de 5 años del país es de 5,9% según datos del Instituto Nacional de Alimentación y Nutrición. Objetivo: Determinar la prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años de las comunidades indígenas de Yby Yaú y Azote’y. Materiales y Métodos: Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Pa'i Tavyterã y Mbya Guaraní de 11 comunidades indígenas de Yby Yau y Azote’y. Fueron examinados 349 menores de 5 años de edad. Para la evaluación del estado nutricional se utilizó la curva de crecimiento de la OMS. Los niños/as fueron pesados/as en balanzas mecánicas. Para la medida de la altura, los mayores de dos años fueron medidos con el tallimetro y los menores de 2 años con cinta métrica. Resultados: Se observó desnutrición en 53 niños que equivale al 15% de la muestra. De estos 60,4% padecían de desnutrición moderada y 39,6% desnutrición grave. El mayor porcentaje de desnutrición se encontró en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los niños tenían desnutrición crónica. 
Conclusiones: La prevalencia de desnutrición en indígenas en Yby Yaú y Azote’y es de 15%, lo que sobrepasa los índices de desnutrición en menores de 5 años del país.]]></p></abstract>
+<abstract abstract-type="short" xml:lang="en"><p><![CDATA[Introduction: Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. Objective: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Yaú Yby. Materials and Methods: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyterá and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. Results: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. Conclusions: The prevalence of malnutrition in indigenous children in Yby Yaú and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.]]></p></abstract>
+<kwd-group>
+<kwd lng="es"><![CDATA[Desnutrición aguda]]></kwd>
+<kwd lng="es"><![CDATA[desnutrición crónica]]></kwd>
+<kwd lng="es"><![CDATA[indígenas]]></kwd>
+<kwd lng="en"><![CDATA[Acute malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[chronic malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[indigenous]]></kwd>
+</kwd-group>
+</article-meta>
+</front><body><![CDATA[ <p align="right"><font size="3" face="Verdana"><b>ART&Iacute;CULO ORIGINAL</b></font></p> <p align="left">&nbsp;</p> <p align="left"><font size="4" face="Verdana"><b>Prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en&nbsp; ni&ntilde;os menores de 5 a&ntilde;os en las comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y, 2011</b></font></p> <p align="left"><font size="3" face="Verdana"><b><i>Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011</i></b></font></p> <p align="center">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>Syntia Carolina Ruiz Valiente<sup>(1)</sup>, Manuel Ruiz Ca&ntilde;ete<sup>(2)</sup>, Bartola Cohene Velazquez<sup>(3)</sup></b></font></p> <p align="left"> <font size="2" face="Verdana">1. Hospital General Pedi&aacute;trico Ni&ntilde;os Acosta &Ntilde;u. Reducto-San Lorenzo, Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">2. Centro de Salud de Yby Yau. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">3. Puesto de Salud de Paso Tuya. Azote&rsquo;y. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana"><b>Correspondencia</b>: Syntia Carolina Ruiz Valiente. E-mail: scrv_py@hotmail.com</font></p> ]]></body>
+<body><![CDATA[<p align="left"> <font size="2" face="Verdana">Recibido: 24/01/2015; Aceptado: 10/06/2015.</font></p> <p align="left"> <font size="2" face="Verdana"><i>Los autores declaran que no existen conflictos de inter&eacute;s en el presente estudio.</i></font></p> <p align="left">&nbsp;</p> <hr size="1" noshade> <p align="left"><font size="2" face="Verdana"><b>RESUMEN</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introducci&oacute;n: </b>La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrici&oacute;n. La desnutrici&oacute;n infantil no es solo un problema de falta de alimentos, es un conflicto social m&aacute;s profundo. La prevalencia de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s es de 5,9% seg&uacute;n datos del Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n. <b>Objetivo</b>: Determinar la prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby Ya&uacute; y Azote&rsquo;y. <b>Materiales y M&eacute;todos:</b> Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa'i Tavyter&atilde; y Mbya Guaran&iacute; de 11 comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y. Fueron examinados 349 menores de 5 a&ntilde;os de edad. Para la evaluaci&oacute;n del estado nutricional se utiliz&oacute; la curva de crecimiento de la OMS. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas. Para la medida de la altura, los mayores de dos a&ntilde;os fueron medidos con el tallimetro y los menores de 2 a&ntilde;os con cinta m&eacute;trica. <b>Resultados:</b> Se observ&oacute; desnutrici&oacute;n en 53 ni&ntilde;os que equivale al 15% de la muestra. 
De estos 60,4% padec&iacute;an de desnutrici&oacute;n moderada y 39,6% desnutrici&oacute;n grave. El mayor porcentaje de desnutrici&oacute;n se encontr&oacute; en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los ni&ntilde;os ten&iacute;an desnutrici&oacute;n cr&oacute;nica. <b>Conclusiones:</b> La prevalencia de desnutrici&oacute;n en ind&iacute;genas en Yby Ya&uacute; y Azote&rsquo;y es de 15%, lo que sobrepasa los &iacute;ndices de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s.</font></p> <p align="left"><font size="2" face="Verdana"><b>Palabras clave:</b> Desnutrici&oacute;n aguda, desnutrici&oacute;n cr&oacute;nica, ind&iacute;genas.</font></p> <p align="left">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>ABSTRACT</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introduction:</b> Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age&nbsp; in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. <b>Objective</b>: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Ya&uacute; Yby. <b>Materials and Methods</b>: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyter&aacute; and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. 
To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. <b>Results</b>: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. <b>Conclusions</b>: The prevalence of malnutrition in indigenous children in Yby Ya&uacute; and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.</font></p> <p align="left"><font size="2" face="Verdana"><b>Keywords</b>: Acute malnutrition, chronic malnutrition, indigenous.</font></p> <hr size="1" noshade> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>INTRODUCCI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">La desnutrici&oacute;n es una enfermedad multisist&eacute;mica, que afecta todos los &oacute;rganos y sistemas del ser humano, es producida por una disminuci&oacute;n dr&aacute;stica, aguda o cr&oacute;nica, en la disponibilidad de nutrimentos, ya sea por ingesti&oacute;n insuficiente, inadecuada absorci&oacute;n, exceso de p&eacute;rdidas o la conjunci&oacute;n de dos o m&aacute;s de estos factores. Se manifiesta por grados de d&eacute;ficit antropom&eacute;trico, signos y s&iacute;ntomas cl&iacute;nicos y alteraciones bioqu&iacute;micas, hematol&oacute;gicas e inmunol&oacute;gicas (1).</font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n ind&iacute;gena est&aacute; gravemente afectada por este problema, tal vez por ser un estrato olvidado y descuidado por la poblaci&oacute;n en general y por el estado paraguayo. A pesar de las leyes, y de todos los proyectos que favorecen a esta esfera de la sociedad, a&uacute;n existe un abismo inimaginable entre lo ideal y lo real. Mientras se elaboran programas que buscan dar mejores condiciones de vida a estas comunidades, que la mayor&iacute;a de las veces solo quedan plasmados en el papel, los &iacute;ndices de desnutrici&oacute;n son alarmantes. Esto se debe probablemente a que en la sociedad posmoderna, la deforestaci&oacute;n, el uso de agrot&oacute;xicos, la invasi&oacute;n de los terratenientes despoj&oacute; a los nativos de sus tierras, oblig&aacute;ndolos a vivir en situaciones carenciales, pues estos debido a su cultura esperan que la naturaleza les ofrezca el sustento diario. 
Las costumbres, la econom&iacute;a y la religi&oacute;n en las etnias Paí Tavyter&atilde; y Mby`a Guaran&iacute; est&aacute;n &iacute;ntimamente relacionadas a la producci&oacute;n alimenticia e ingesta.</font></p> <p align="left"><font size="2" face="Verdana">Para el nativo guaran&iacute; es muy dif&iacute;cil comprender que el hombre es el que debe producir alimento para su sustento, pero como la sociedad actual obliga a ello, estos por no conseguir adaptarse a los cambios que se produjeron, est&aacute;n m&aacute;s expuestos a las carencias alimentarias. Seg&uacute;n datos del gobierno central en el 2008, 41,8% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os padec&iacute;an de desnutrici&oacute;n.</font></p> <p align="left"><font size="2" face="Verdana">En un estudio realizado en M&eacute;xico, la prevalencia de desnutrici&oacute;n en ind&iacute;genas fue 39,4%(2). Un 44% present&oacute; uno o m&aacute;s signos cl&iacute;nicos de malnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Encuestas y Censos del Ecuador (2001 y 2006) 40,1% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os tienen desnutrici&oacute;n cr&oacute;nica (3).</font></p> <p align="left"><font size="2" face="Verdana">En Caracas, se hizo un estudio con la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, y ellos obtuvieron el siguiente resultado: El diagn&oacute;stico nutricional hallado con mayor frecuencia fue Nutrici&oacute;n normal (55%) seguida por Desnutrici&oacute;n Subcl&iacute;nica (15%) y Desnutrici&oacute;n Leve (12%). 
En l&iacute;neas generales, un 55% de la poblaci&oacute;n se encontraba en rangos de nutrici&oacute;n normal, mientras el 45% restante presentaba problema de malnutrici&oacute;n comprendiendo &eacute;sta por d&eacute;ficit y por exceso (4).</font></p> <p align="left"><font size="2" face="Verdana">En el Brasil en un estudio realizado para determinar el perfil nutricional de los abor&iacute;genes menores de 5 a&ntilde;os de Kaing&aacute;ngen Paran&aacute; vieron que cuando utilizado los criterios propuestos por la OMS, se registr&oacute; una alta prevalencia de d&eacute;ficit Estatura/Edad, con uno en cuatro ni&ntilde;os (24,8%) que presentaba este diagn&oacute;stico. El d&eacute;ficit de Peso/Edad fue diagnosticado en 9,2% de los ni&ntilde;os evaluados. Los &iacute;ndices de peso para la altura diagnosticaron solo tres ni&ntilde;os (2,1%) como desnutridas agudas (5).</font></p> <p align="left"><font size="2" face="Verdana">En otro estudio realizado tambi&eacute;n en el Brasil, esta vez en Amazonia, con ni&ntilde;os de la etnia Suru&iacute; se observ&oacute; que los porcentajes de los ni&ntilde;os con d&eacute;ficit en los &iacute;ndices de estatura para la edad fue 31,4%, peso para la edad 12,4% y peso para la estatura 0% (6).</font></p> <p align="left"><font size="2" face="Verdana">El objetivo del presente estudio es determinar la prevalencia de desnutrici&oacute;n en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y y conocer el comportamiento alimentario de los ni&ntilde;os/as de las comunidades ind&iacute;genas estudiadas.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>MATERIALES Y M&Eacute;TODOS</b></font></p> <p align="left"><font size="2" face="Verdana">Estudio transversal, descriptivo realizado en el periodo de enero a abril del a&ntilde;o 2011, donde se identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa&#297; Tavyter&atilde; y Mby`a Guaran&iacute; en los distritos de Yby-Ya&uacute; y Azote&rsquo;y.</font></p> <p align="left"><font size="2" face="Verdana">El tama&ntilde;o muestral total fue de 370 ni&ntilde;os, determinado a trav&eacute;s de censo realizado por el Centro de Salud de Yby-Ya&uacute; y el Puesto de Salud de Paso Tuya. Para los fines del estudio fueron identificados 349 ni&ntilde;os (94.3%) de ni&ntilde;os reci&eacute;n nacidos a menores de 5 a&ntilde;os en los distritos de Yby-Ya&uacute; y Azote'y.</font></p> <p align="left"><font size="2" face="Verdana">Las etnias que se encuentran dentro del &aacute;rea de estudio est&aacute; compuesta por los mby`a guaran&iacute; y los pa&#297; tavyter&atilde;, distribuidas en las siguientes comunidades ind&iacute;genas: Vy'apav&#7869;, Yrapey, Guyrakeha, Guyra &Ntilde;e'engatuamba, Satí;, San Juan, Mbery'o Jaguarymi, Ka'aguy Poty Rory, Yvyra'ija, Tukambiju y Takuaritiy.</font></p> <p align="left"><font size="2" face="Verdana">El trabajo se realiz&oacute; por concentraci&oacute;n, en los locales fijados por los l&iacute;deres de las distintas comunidades. Fue aplicado un cuestionario a las madres, creado para el efecto por medio de entrevista. 
La edad de los ni&ntilde;os fue dada por las madres, pues la mayor&iacute;a de estas no cuentan con registro de nacimiento, ni siquiera certificado de nacido vivo.</font></p> <p align="left"><font size="2" face="Verdana">Para la evaluaci&oacute;n del estado nutricional de los ni&ntilde;os se opt&oacute; por la curva del gr&aacute;fico de crecimiento de la Organizaci&oacute;n Mundial de la Salud (OMS) lo cual est&aacute; contenido en la libreta del ni&ntilde;o y la ni&ntilde;a. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas, los que ya consegu&iacute;an quedarse de pie fueron pesados en balanza de pie y los ni&ntilde;os menores de 1 a&ntilde;o en balanzas colgantes.</font></p> <p align="left"><font size="2" face="Verdana">Para la medida de la altura, los ni&ntilde;os mayores de dos a&ntilde;os fueron colocados en posici&oacute;n de pie, bien rectos, y fueron medidos con el tallimetro. La talla de los ni&ntilde;os menores de 2 a&ntilde;os fue realizada con cinta m&eacute;trica con el ni&ntilde;o/a en dec&uacute;bito supino en superficie recta.</font></p> <p align="left"><font size="2" face="Verdana">Los datos fueron analizados manualmente, y los gr&aacute;ficos confeccionados con el programa Microsoft Office Excel 2007.</font></p> <p align="justify">&nbsp;</p> ]]></body>
+<body><![CDATA[<p align="left"><font size="3" face="Verdana"><b>RESULTADOS</b></font></p> <p align="left"><font size="2" face="Verdana">Se evaluaron 349 ni&ntilde;os, que representan el 94,3% del total de abor&iacute;genes menores de 5 a&ntilde;os de las comunidades de Yby-Ya&uacute; y Azote&rsquo;y. Del total de 349 ni&ntilde;os, 69 % (240) son Paí; Tavyter&atilde; y 31% (109) Mby`a Guaran&iacute;. </font></p> <p align="left"><font size="2" face="Verdana">La comunidad con el mayor porcentaje de ni&ntilde;os fue la de Vy'&atilde;pav&#7869; (36,4%), y la de menor frecuencia fue la comunidad de Tekoha Kag&atilde;t&atilde;, que es una comunidad reci&eacute;n formada localizada en Pasi&ntilde;o (<a href="#2a02f1">Figura 1</a>).</font></p> <p align="center"><a name="2a02f1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f1.jpg"></p> <p align="left"><font size="2" face="Verdana">Viendo el perfil nutricional de los ni&ntilde;os, se pudo observar que 61% de los ni&ntilde;os/as no est&aacute;n desnutridos, 24% de los ni&ntilde;os/as est&aacute;n en riesgo de desnutrici&oacute;n y 15% est&aacute;n con desnutrici&oacute;n. Aunque se trata de un estrato social desfavorecido tambi&eacute;n se observa &iacute;ndice de sobrepeso y obesidad, en las comunidades de Vy'&atilde;pav&#7869; e Yrapey (<a href="#2a02f2">Figura 2</a>).</font></p> <p align="center"><a name="2a02f2"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f2.jpg"></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">Teniendo presente los gr&aacute;ficos de Talla/Edad la prevalencia de desnutrici&oacute;n cr&oacute;nica es bastante elevada, pues 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. El mayor &iacute;ndice de desnutrici&oacute;n se encuentran en los primeros 24 meses de vida (<a href="#2a02t1">Tabla 1</a>). De los 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. Siendo que el mayor porcentaje de desnutrici&oacute;n se observa en Vy'&atilde;pav&#7869;.</font></p> <p align="center"><a name="2a02t1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02t1.jpg"></p> <p align="left"><font size="2" face="Verdana">Se estudi&oacute; adem&aacute;s el comportamiento alimentario de estos ni&ntilde;os, viendo que alimentos preferencialmente hacen parte de su dieta y la edad de introducci&oacute;n de los mismos, la mayor&iacute;a de las madres introducen alg&uacute;n tipo alimento entre los 6 y 8 meses de edad (<a href="#2a02f3">Figura 3</a>) y los primeros alimentos introducidos dependen del lugar donde estos habitan. 
El caldo de pescado es uno de los primeros alimentos introducidos en las comunidades que viven cerca de los r&iacute;os, entretanto el 60% inician la alimentaci&oacute;n con caldo de arroz y caldo de fideo.</font></p> <p align="center"><a name="2a02f3"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f3.jpg"></p> <p align="left"><font size="2" face="Verdana">Al observar la frecuencia en que se alimentan estos ni&ntilde;os, el 64% se alimenta tres veces al d&iacute;a, el 20% menos de 3 veces al d&iacute;a y solo el 16 % m&aacute;s de tres veces al d&iacute;a.</font></p> <p align="left"><font size="2" face="Verdana">El principal nutriente en la dieta son los carbohidratos, el 47% de los ni&ntilde;os consumen carbohidratos m&aacute;s de 5 veces por semana, y el 21% menos de 3 veces por semana. El mayor porcentaje de consumo de prote&iacute;nas se observa en las comunidades que se encuentran cerca de r&iacute;os (Guyra &Ntilde;e`engatuamba y Mbery'o Jaguarymi), siendo que 70% consume prote&iacute;nas menos de 3 veces por semana, y solo el 3% m&aacute;s de cinco veces por semana. El consumo de verduras y hortalizas es muy escaso, el 91% consume verduras y hortalizas menos de 3 veces por semana, el 2% m&aacute;s de 5 veces y 7% entre 3 y 5 veces por semana.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>DISCUSI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">A lo largo de toda la historia de la humanidad, la desnutrici&oacute;n ha sido una patolog&iacute;a de las clases sociales menos privilegiadas, son los que no poseen las condiciones necesarias para tener una vida digna, donde la educaci&oacute;n, salud, recursos econ&oacute;micos son miserables, donde esta dolencia alcanza su auge (7).</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n los datos del Censo realizado por la Unidad de Salud Ind&iacute;gena que se encuentra en el Distrito de Yby-Ya&uacute;, los Puestos de Salud de Yby- Ya&uacute; y Azote&rsquo;y en el tercer trimestre del A&ntilde;o 2010, se encontraron 328 ni&ntilde;os de hasta 60 meses (8). Al realizar los trabajos de campo, este n&uacute;mero se elev&oacute; a 349 individuos, por lo que se hizo un nuevo censo solo con los ni&ntilde;os de este grupo etario. Ese fen&oacute;meno tal vez, se deba a la migraciones que se desarrollan normalmente entre los guaran&iacute;. Al observar la historia, y tambi&eacute;n por la experiencia que se adquiri&oacute; durante el trabajo de campo, se pudo observar la familia ling&uuml;&iacute;stica a la cual pertenecen los mby`a y los paí; (la guaran&iacute;) son n&oacute;madas, es com&uacute;n que migren a otras comunidades, en un mismo Tekoha (9,10).</b></font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n diana fue de 370 ni&ntilde;os menores de 5 a&ntilde;os de los cuales se lleg&oacute; a entrevistar a las madres de 349 y se hizo las mediciones antropom&eacute;tricas posteriormente. 
En la mayor&iacute;a de las comunidades ind&iacute;genas se obtuvo el 100% de participaci&oacute;n, son excepciones las comunidades de Yrapey y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Del total de ni&ntilde;os/as, la etnia de mayor prevalencia fue la de Paí; Tavyter&atilde;. En relaci&oacute;n al sexo, las comunidades son bastante equilibradas, con una ligera prevalencia del sexo masculino sobre el femenino.</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n datos de la UNICEF en Paraguay se observa 3,4% de desnutrici&oacute;n aguda en ni&ntilde;os menores de 5 a&ntilde;os (11). La prevalencia de desnutrici&oacute;n en los ni&ntilde;os paraguayos menores de 5 a&ntilde;os en el &aacute;rea rural es de 5,9% y en el &aacute;rea urbana es de 4,5% (12). Existen pocas publicaciones sobre este tema en abor&iacute;genes menores de 5 a&ntilde;os, siendo que el mayor n&uacute;mero de publicaciones fue realizado por el Brasil (12,4%), M&eacute;xico (39,4%) y Ecuador.</b></font></p> <p align="left"><font size="2" face="Verdana">La prevalencia de desnutrici&oacute;n en las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y es de 15,2%, observando los gr&aacute;ficos de Peso/edad si de 2 a&ntilde;os y Peso/Talla en mayores de 2 a&ntilde;os y menores de 5 a&ntilde;os. Las comunidades donde la desnutrici&oacute;n son m&aacute;s prevalentes son Guyrakeha e Yvyra'ija; en Satí; y Tekoha Kagat&atilde; no se encontr&oacute; ni&ntilde;os desnutridos.</b></font></p> <p align="left"><font size="2" face="Verdana">De 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. El grupo con mayor &iacute;ndice de desnutrici&oacute;n, se encuentra durante los primeros 24 meses, pues es en esta etapa donde el organismo requiere una mayor cantidad de nutrientes por el mayor crecimiento. 
Adem&aacute;s, despu&eacute;s de los 6 meses se inicia la introducci&oacute;n de otros alimentos. Estos dos factores, asociados aumentan el &iacute;ndice de desnutrici&oacute;n en este grupo de edad.</b></font></p> <p align="left"><font size="2" face="Verdana">De la poblaci&oacute;n total de los ni&ntilde;os estudiados el 23,8% est&aacute;n con riesgo de desnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n (INAN) en el a&ntilde;o 2010, 13,6% de ni&ntilde;os menores de 5 a&ntilde;os del &aacute;rea urbana y 16,2% del &aacute;rea rural del Paraguay sufren desnutrici&oacute;n cr&oacute;nica. En una encuesta realizada por la Direcci&oacute;n General de Estad&iacute;stica, Encuestas y Censos en el a&ntilde;o 2008, 41,8% de los ni&ntilde;os/as ind&iacute;genas menores de cinco a&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Observadas las medidas de Talla/Edad el 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Ese dato es alarmante, porque la desnutrici&oacute;n cr&oacute;nica es consecuencia de una carencia prolongada de alimentos o enfermedades sucesivas. En Tukambiju, Mbery'o Jaguarymi, Guyrakeha, Yvyra'ija y Satí; son comunidades con una prevalencia mayor al 80% de ni&ntilde;os/as con talla baja para la edad.</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">El &iacute;ndice de desnutrici&oacute;n en ind&iacute;genas en los distritos de Yby-Ya&uacute; y Azote&rsquo;y, sobrepasa la prevalencia general de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s, lo cual est&aacute; alrededor de 5.9% seg&uacute;n datos del INAN.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas se puede observar que un porcentaje razonable introduce alimentos entre los 6 meses y antes de los 9 meses. El porcentaje de los que introducen antes de los 6 meses es de 18,6% y entre los 9 meses y un a&ntilde;o es de 27%. Se pudo observar que, ocho ni&ntilde;os tuvieron lactancia materna exclusiva por m&aacute;s de 1 a&ntilde;o. Todos los ni&ntilde;os/as con lactancia materna exclusiva en la fecha de la recolecci&oacute;n de datos ten&iacute;a menos de 6 meses o 6 meses. El caldo de fideo y de arroz ocupa el primer y segundo lugar respectivamente como primer alimento introducido por las madres. Los alimentos que deber&iacute;an ser introducidos inicialmente como el pur&eacute; de frutas y verduras ocupan un peque&ntilde;o porcentaje en la lista. Otros alimentos que se tendr&iacute;an que introducir despu&eacute;s de los 9 meses, de preferencia a los un a&ntilde;o, como por ejemplo el caldo de poroto, caldo de pescado, leche de vaca y huevo son los primeros alimentos que se introducen.</b></font></p> <p align="left"><font size="2" face="Verdana">El 64% de los ni&ntilde;os se alimentan tres veces al d&iacute;a, el 20,5% menos de tres veces y 15,5% m&aacute;s de tres veces al d&iacute;a.</b></font></p> <p align="left"><font size="2" face="Verdana">El 69,5% de los ni&ntilde;os/as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y consumen prote&iacute;nas menos de tres veces por semana; 27,3% consumen de tres a cinco veces por semana los diferentes tipos de prote&iacute;nas, teniendo predominancia el consumo de pez. 
Solo 3,2% consume prote&iacute;nas m&aacute;s de 5 veces. Las comunidades que viven cerca de bosques, r&iacute;os o arroyos son los que m&aacute;s consumen prote&iacute;nas.</b></font></p> <p align="left"><font size="2" face="Verdana">Los carbohidratos son la principal fuente de alimentaci&oacute;n de los ni&ntilde;os y ni&ntilde;as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y. Eso se debe a que son los alimentos de m&aacute;s f&aacute;cil adquisici&oacute;n y los m&aacute;s accesibles econ&oacute;micamente hablando.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas el consumo de verduras y hortalizas es escaso. Las comunidades que m&aacute;s consumen verduras y hortalizas son Mberyo Jaguarymi y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Este trabajo refleja la realidad de las comunidades ind&iacute;genas de los dos distritos observados, no podemos extrapolar estas mismas cifras en el departamento de Concepci&oacute;n, o en todo el pa&iacute;s por el tama&ntilde;o de la muestra, es necesario hacer nuevos estudios con un tama&ntilde;o muestral mayor para obtener una visi&oacute;n del verdadero estado nutricional de los ni&ntilde;os ind&iacute;genas. El porcentaje de desnutrici&oacute;n es alto, pero se trata de distritos con no muchos recursos econ&oacute;micos, donde la pobreza es una realidad a&uacute;n en otros estratos sociales.</b></font></p> <p align="left"><font size="2" face="Verdana">La realidad ind&iacute;gena es un problema real, y una manera de reducir estas cifras es ense&ntilde;&aacute;ndoles a producir su propio alimento. 
Para ello no debemos luchar con su cultura ni intentar hacerlos ver el mundo a trav&eacute;s de nuestra realidad, sino dentro de sus costumbres encontrar formas de que ellos tengan condiciones de un mejor porvenir.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>AGRADECIMIENTOS</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">A las comunidades ind&iacute;genas que participaron en nuestro estudio, los profesionales de blanco del Centro de Salud de Yby-Yau y Azote&rsquo;y, a la Comunidad de Hermanas de la Divina Providencia de Yby-Yau, a la Dra. Blanca Villalba y a la Dra. Gloria Mart&iacute;nez.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>REFERENCIAS</b></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">1. Monteiro CA. Fome, desnutri&ccedil;&atilde;o e pobreza: al&eacute;m da sem&acirc;ntica. Sa&uacute;de Soc. 2003;12(1):7-11. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102986&pid=S1683-9803201500020000200001&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">2. Vi&ntilde;as MR, Fr&iacute;as ML, Verd&uacute; JM. Entorno social y desnutrici&oacute;n en ni&ntilde;os de 1 a 4 a&ntilde;os de comunidades ind&iacute;genas de M&eacute;xico. Rev Esp Nutr Comunitaria. 2005;11(3):128-34. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102988&pid=S1683-9803201500020000200002&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">3. INEC. Ecuador: 40,1% de ind&iacute;genas con desnutrici&oacute;n cr&oacute;nica. Ecuador: Estudio del INEC; 2009. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102990&pid=S1683-9803201500020000200003&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">4. Chumpitaz D, Russo A, Del NogaL B, Case C, Lares M. Evaluaci&oacute;n nutricional de la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004. AVFT. 2006;25(1):26-31. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102992&pid=S1683-9803201500020000200004&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">5. Kuhl AM, Tittoni C, Leite MS, Bastos JL. Perfil Nutricional e fatores associados &agrave; ocorr&ecirc;ncia de desnutri&ccedil;&atilde;o entre crian&ccedil;as ind&iacute;genas Kaing&aacute;ng da Terra Ind&iacute;gena de Mangueirinha, Paran&aacute;, Brasil. Cad Sa&uacute;de P&uacute;blica. 2009;25(2):409-420. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102994&pid=S1683-9803201500020000200005&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">6. Orellana JD, Coimbra Jr. CE, Louren&ccedil;o AE, Santos RV. Estado nutricional e anemia en crian&ccedil;as Suru&iacute;, Amaz&ocirc;nia, Brasil. J Pediatr (Rio J). 2006;82(5):383-88. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102996&pid=S1683-9803201500020000200006&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">7. Organizaci&oacute;n de las Naciones Unidas. Foro permanente para las cuestiones ind&iacute;genas: informe sobre el quinto per&iacute;odo de sesiones (15 a 26 de mayo de 2006). Nueva York: Naciones Unidas; 2006. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102998&pid=S1683-9803201500020000200007&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">8. Centro de Salud de Yby-Yau. Censo local de las comunidades ind&iacute;genas. Yby-Yau; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103000&pid=S1683-9803201500020000200008&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">9. Chase-Sardi M, Brun A, Enciso MA. Situaci&oacute;n sociocultural, econ&oacute;mica, jur&iacute;dico-pol&iacute;tico actual de las comunidades ind&iacute;genas del Paraguay. Asunci&oacute;n: UCA; 1989. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103002&pid=S1683-9803201500020000200009&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">10. 
Meliá B, Grunberg G, Grunberg F. Paî -Tavyterã: etnograf&iacute;a guaran&iacute; del Paraguay contempor&aacute;neo. 2da. ed. Asunci&oacute;n: Centro de Estudios Antrop&oacute;logicos de la Universidad Cat&oacute;lica; 2008. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103004&pid=S1683-9803201500020000200010&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">11. FAO. Panorama de la seguridad alimentaria y nutricional en Am&eacute;rica Latina y el Caribe 2013. FAO; 2014. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103006&pid=S1683-9803201500020000200011&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --> </font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">12. Masi C, S&aacute;nchez Bernal S, Dallman D, Rodas A, Morinigo G, Mendoza L. Perfil nutricional de ni&ntilde;os menores de 5 a&ntilde;os que acuden a servicios p&uacute;blicos de salud en el Paraguay. Asunci&oacute;n: INAN; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103008&pid=S1683-9803201500020000200012&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> ]]></body><back>
+<ref-list>
+<ref id="B1">
+<label>1</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Monteiro]]></surname>
+<given-names><![CDATA[CA]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Fome, desnutrição e pobreza: além da semântica]]></article-title>
+<source><![CDATA[Saúde Soc]]></source>
+<year>2003</year>
+<volume>12</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>7-11</page-range></nlm-citation>
+</ref>
+<ref id="B2">
+<label>2</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Viñas]]></surname>
+<given-names><![CDATA[MR]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Frías]]></surname>
+<given-names><![CDATA[ML]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Verdú]]></surname>
+<given-names><![CDATA[JM]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="es"><![CDATA[Entorno social y desnutrición en niños de 1 a 4 años de comunidades indígenas de México]]></article-title>
+<source><![CDATA[Rev Esp Nutr Comunitaria]]></source>
+<year>2005</year>
+<volume>11</volume>
+<numero>3</numero>
+<issue>3</issue>
+<page-range>128-34</page-range></nlm-citation>
+</ref>
+<ref id="B3">
+<label>3</label><nlm-citation citation-type="book">
+<collab>INEC</collab>
+<source><![CDATA[Ecuador: 40,1% de indígenas con desnutrición crónica]]></source>
+<year>2009</year>
+<publisher-loc><![CDATA[Ecuador ]]></publisher-loc>
+<publisher-name><![CDATA[Estudio del INEC]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B4">
+<label>4</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chumpitaz]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Russo]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Del NogaL]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Case]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lares]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Evaluación nutricional de la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004]]></article-title>
+<source><![CDATA[AVFT]]></source>
+<year>2006</year>
+<volume>25</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>26-31</page-range></nlm-citation>
+</ref>
+<ref id="B5">
+<label>5</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Kuhl]]></surname>
+<given-names><![CDATA[AM]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Tittoni]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Leite]]></surname>
+<given-names><![CDATA[MS]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Bastos]]></surname>
+<given-names><![CDATA[JL]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Perfil Nutricional e fatores associados à ocorrência de desnutrição entre crianças indígenas Kaingáng da Terra Indígena de Mangueirinha, Paraná, Brasil]]></article-title>
+<source><![CDATA[Cad Saúde Pública]]></source>
+<year>2009</year>
+<volume>25</volume>
+<numero>2</numero>
+<issue>2</issue>
+<page-range>409-420</page-range></nlm-citation>
+</ref>
+<ref id="B6">
+<label>6</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Orellana]]></surname>
+<given-names><![CDATA[JD]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Coimbra Jr]]></surname>
+<given-names><![CDATA[CE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lourenço]]></surname>
+<given-names><![CDATA[AE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Santos]]></surname>
+<given-names><![CDATA[RV]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Estado nutricional e anemia en crianças Suruí, Amazônia, Brasil]]></article-title>
+<source><![CDATA[J Pediatr (Rio J)]]></source>
+<year>2006</year>
+<volume>82</volume>
+<numero>5</numero>
+<issue>5</issue>
+<page-range>383-88</page-range></nlm-citation>
+</ref>
+<ref id="B7">
+<label>7</label><nlm-citation citation-type="book">
+<collab>Organización de las Naciones Unidas</collab>
+<source><![CDATA[Foro permanente para las cuestiones indígenas: informe sobre el quinto período de sesiones (15 a 26 de mayo de 2006)]]></source>
+<year>2006</year>
+<publisher-loc><![CDATA[Nueva York ]]></publisher-loc>
+<publisher-name><![CDATA[Naciones Unidas]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B8">
+<label>8</label><nlm-citation citation-type="">
+<collab>Centro de Salud de Yby-Yau</collab>
+<source><![CDATA[Censo local de las comunidades indígenas]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Yby-Yau ]]></publisher-loc>
+</nlm-citation>
+</ref>
+<ref id="B9">
+<label>9</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chase-Sardi]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Brun]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Enciso]]></surname>
+<given-names><![CDATA[MA]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Situación sociocultural, económica, jurídico-político actual de las comunidades indígenas del Paraguay]]></source>
+<year>1989</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[UCA]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B10">
+<label>10</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Meliá]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[F]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Paî -Tavyterã: etnografía guaraní del Paraguay contemporáneo. 2da. ed]]></source>
+<year>2008</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[Centro de Estudios Antropólogicos de la Universidad Católica]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B11">
+<label>11</label><nlm-citation citation-type="book">
+<collab>FAO</collab>
+<source><![CDATA[Panorama de la seguridad alimentaria y nutricional en América Latina y el Caribe 2013]]></source>
+<year>2014</year>
+<publisher-name><![CDATA[FAO]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B12">
+<label>12</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Masi]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Sánchez Bernal]]></surname>
+<given-names><![CDATA[S]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Dallman]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Rodas]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Morinigo]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Mendoza]]></surname>
+<given-names><![CDATA[L]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Perfil nutricional de niños menores de 5 años que acuden a servicios públicos de salud en el Paraguay]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[INAN]]></publisher-name>
+</nlm-citation>
+</ref>
+</ref-list>
+</back>
+</article>
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
new file mode 100644
index 0000000..3f84ea4
--- /dev/null
+++ b/python/tests/files/small.json
@@ -0,0 +1,52 @@
+{
+ "title": "Dummy Example File",
+ "authors": [
+ {
+ "name": "Brewster Kahle",
+ "given_name": "Brewster",
+ "surname": "Kahle",
+ "affiliation": {
+ "department": "Faculty ofAgricultrial Engineering",
+ "laboratory": "Plant Physiology Laboratory",
+ "institution": "Technion-Israel Institute of Technology",
+ "address": {
+ "postCode": "32000",
+ "settlement": "Haifa",
+ "country": "Israel"
+ }
+ }
+ },
+ {"name": "J Doe", "given_name": "J", "surname": "Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678"
+ },
+ "date": "2000",
+ "citations": [
+ { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "grobid_timestamp": "2018-04-02T00:31+0000",
+ "grobid_version": "0.5.1-SNAPSHOT",
+ "language_code": "en"
+}
diff --git a/python/tests/files/small.xml b/python/tests/files/small.xml
new file mode 100644
index 0000000..4de4059
--- /dev/null
+++ b/python/tests/files/small.xml
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">Dummy Example File</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2000">2000</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Brewster</forename><surname>Kahle</surname></persName>
+ <affiliation key="aff0">
+ <orgName type="department">Faculty ofAgricultrial Engineering</orgName>
+ <orgName type="laboratory">Plant Physiology Laboratory</orgName>
+ <orgName type="institution">Technion-Israel Institute of Technology</orgName>
+ <address>
+ <postCode>32000</postCode>
+ <settlement>Haifa</settlement>
+ <country key="IL">Israel</country>
+ </address>
+ </affiliation>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Doe</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">Internet Archive</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">Dummy Example File</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678</title>
+ <imprint>
+ <date type="published" when="2000">2000</date>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Fake Data</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>Everything you ever wanted to know about nothing</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>
+Everything starts somewhere, as somebody<ref type="bibr" target="#b0">[1]</ref> once said.</p></div>
+
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">In Depth</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Meat</head><p>
+You know, for kids.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Potatos</head><p>
+QED.</p></div>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">Everything is Wonderful</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">A</forename><surname>Seaperson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Letters in the Alphabet</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">All about Facts</title>
+ </analytic>
+ <monogr>
+ <title level="j">The Dictionary</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>None</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
new file mode 100644
index 0000000..36d90ef
--- /dev/null
+++ b/python/tests/test_grobid.py
@@ -0,0 +1,79 @@
+
+import pytest
+import struct
+import responses
+
+from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from test_wayback import wayback_client, cdx_client
+
+
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
+with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+ REAL_TEI_XML = f.read()
+
+@pytest.fixture
+def grobid_client():
+ """
+ Pytest fixture returning a GrobidClient configured with the placeholder
+ host "http://dummy-grobid". Tests using it register mocked HTTP responses
+ for that host instead of talking to a real service.
+ """
+ client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ return client
+
+@responses.activate
+def test_grobid_503(grobid_client):
+ """
+ A mocked HTTP 503 from the fulltext endpoint should be reported by
+ process_fulltext() as status_code 503 with status "error".
+ """
+
+ status = b'{"status": "done broke due to 503"}'
+ responses.add(responses.POST,
+ 'http://dummy-grobid/api/processFulltextDocument', status=503,
+ body=status)
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # the mocked GROBID endpoint should have received exactly one POST
+ assert len(responses.calls) == 1
+
+ assert resp['status_code'] == 503
+ assert resp['status'] == "error"
+
+@responses.activate
+def test_grobid_success(grobid_client):
+ """
+ A mocked HTTP 200 carrying REAL_TEI_XML should be reported by
+ process_fulltext() as status "success", with the TEI-XML returned as text.
+ """
+
+ responses.add(responses.POST,
+ 'http://dummy-grobid/api/processFulltextDocument', status=200,
+ body=REAL_TEI_XML, content_type='text/xml')
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # the mocked GROBID endpoint should have received exactly one POST
+ assert len(responses.calls) == 1
+
+ assert resp['status_code'] == 200
+ assert resp['status'] == "success"
+ #print(type(resp['tei_xml']))
+ #print(type(REAL_TEI_XML))
+ # NOTE(review): presumably the client decodes the body as ISO-8859-1 because
+ # the mocked 'text/xml' response declares no charset -- confirm against
+ # GrobidClient before relying on this.
+ assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+
+@responses.activate
+def test_grobid_worker_cdx(grobid_client, wayback_client):
+ """
+ Pushes the lines of tests/files/example.cdx through a CdxLinePusher into a
+ GrobidWorker (with a BlackholeSink) and checks the resulting counters
+ against the number of mocked GROBID requests.
+ """
+
+ sink = BlackholeSink()
+ worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
+
+ responses.add(responses.POST,
+ 'http://dummy-grobid/api/processFulltextDocument', status=200,
+ body=REAL_TEI_XML, content_type='text/xml')
+
+ with open('tests/files/example.cdx', 'r') as cdx_file:
+ pusher = CdxLinePusher(
+ worker,
+ cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=['application/pdf'],
+ )
+ pusher_counts = pusher.run()
+ # the pusher must have seen at least one line; 7 is the number of lines
+ # expected to be pushed to the worker with the filters above
+ assert pusher_counts['total']
+ assert pusher_counts['pushed'] == 7
+ assert pusher_counts['pushed'] == worker.counts['total']
+
+ # one mocked GROBID request per record the worker counted
+ assert len(responses.calls) == worker.counts['total']
+
diff --git a/mapreduce/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..8497b10 100644
--- a/mapreduce/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
new file mode 100644
index 0000000..9a81852
--- /dev/null
+++ b/python/tests/test_html.py
@@ -0,0 +1,33 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler.html import extract_fulltext_url
+
+def test_extract_fulltext_url():
+ """
+ Exercises extract_fulltext_url() on three inputs: a non-HTML body, an
+ inline snippet with a citation_pdf_url <meta> tag, and the PLOS ONE
+ article fixture.
+ """
+
+ # body with no usable markup: an empty dict is expected
+ resp = extract_fulltext_url("asdf", b"asdf")
+ assert resp == {}
+
+ # the PDF URL comes straight from the citation_pdf_url meta tag
+ resp = extract_fulltext_url(
+ "http://dummy-site/",
+ b"""<html>
+ <head>
+ <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
+ </head>
+ <body>
+ <h1>my big article here</h1>
+ blah
+ </body>
+ </html>"""
+ )
+ assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
+ assert resp['technique'] == "citation_pdf_url"
+
+ # real-world landing page fixture
+ with open('tests/files/plos_one_article.html', 'rb') as f:
+ resp = extract_fulltext_url(
+ "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+ f.read(),
+ )
+ assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
new file mode 100644
index 0000000..e6e48ac
--- /dev/null
+++ b/python/tests/test_html_ingest.py
@@ -0,0 +1,14 @@
+
+import datetime
+import pytest
+
+from sandcrawler.html_ingest import *
+
+
+def test_html_extract_ojs3() -> None:
+ """
+ html_extract_body_teixml() should report status 'success' for the
+ First Monday OJS3 fulltext HTML fixture (read as raw bytes).
+ """
+
+ with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
+ ojs3_html = f.read()
+
+ fulltext = html_extract_body_teixml(ojs3_html)
+ assert fulltext['status'] == 'success'
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
new file mode 100644
index 0000000..bf26a98
--- /dev/null
+++ b/python/tests/test_html_metadata.py
@@ -0,0 +1,229 @@
+
+import datetime
+import pytest
+
+from sandcrawler.html_metadata import *
+
+
+def test_html_metadata_plos() -> None:
+
+ with open('tests/files/plos_one_article.html', 'r') as f:
+ plos_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
+ assert meta is not None
+ assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert meta.doi == "10.1371/journal.pone.0213978"
+ assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert meta.contrib_names == [
+ "Yang Li",
+ "Tuanjie Wang",
+ "Lin Wang",
+ "Mingjun Sun",
+ "Zhizhong Cui",
+ "Shuang Chang",
+ "Yongping Wu",
+ "Xiaodong Zhang",
+ "Xiaohui Yu",
+ "Tao Sun",
+ "Peng Zhao",
+ ]
+ assert meta.container_name == "PLOS ONE"
+ assert meta.container_abbrev == "PLOS ONE"
+ # "Apr 22, 2019"
+ assert meta.release_date == datetime.date(year=2019, month=4, day=22)
+ assert meta.first_page == "e0213978"
+ assert meta.issue == "4"
+ assert meta.volume == "14"
+ assert meta.container_issn == "1932-6203"
+ assert meta.publisher == "Public Library of Science"
+ assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert meta.release_type == "article-journal"
+ assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+
+
+def test_html_metadata_elife() -> None:
+ """
+ Checks title, DOI, contributor names, container name, release date,
+ publisher and fulltext PDF URL extracted from the eLife article fixture.
+ """
+
+ with open('tests/files/elife_article.html', 'r') as f:
+ elife_html = f.read()
+
+ meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
+ assert meta is not None
+ assert meta.title == "Parallel visual circuitry in a basal chordate"
+ assert meta.doi == "10.7554/eLife.44753"
+ assert meta.contrib_names == [
+ "Matthew J Kourakis",
+ "Cezar Borba",
+ "Angela Zhang",
+ "Erin Newman-Smith",
+ "Priscilla Salas",
+ "B Manjunath",
+ "William C Smith",
+ ]
+ assert meta.container_name == "eLife"
+ # 2019-04-18
+ assert meta.release_date == datetime.date(year=2019, month=4, day=18)
+ assert meta.publisher == "eLife Sciences Publications Limited"
+ assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+
+
+def test_html_metadata_peerj() -> None:
+ """
+ Checks title, DOI, contributor names (including a non-ASCII name),
+ container name and release date extracted from the PeerJ article fixture,
+ and that an XML fulltext URL was found.
+ """
+
+ with open('tests/files/peerj_oa_article.html', 'r') as f:
+ peerj_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
+ assert meta is not None
+ assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ assert meta.doi == "10.7717/peerj.4375"
+ assert meta.contrib_names == [
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
+ ]
+ assert meta.container_name == "PeerJ"
+ # "2018-02-13"
+ assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+ # only checks that the URL is present and mentions ".xml", not its exact value
+ assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url
+
+
+def test_html_metadata_nature() -> None:
+ """
+ Checks title, DOI, contributor names, container name, release date,
+ publisher and abstract extracted from the Nature article fixture.
+ """
+
+ with open('tests/files/nature_article.html', 'r') as f:
+ nature_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
+ assert meta is not None
+ assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
+ assert meta.doi == "10.1038/d41586-020-02610-z"
+ assert meta.contrib_names == [
+ "Diana Kwon",
+ ]
+ assert meta.container_name == "Nature"
+ # "2020-09-10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.publisher == "Nature Publishing Group"
+ # note: the expected abstract repeats the same sentence twice; this appears
+ # to come from an error in the Dublin Core metadata of the Nature HTML
+ assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+
+
+def test_html_metadata_ojs3() -> None:
+ """
+ Checks the metadata extracted from the First Monday (OJS3) landing page
+ fixture: title, DOI, contributors, container fields, release date,
+ language, abstract, HTML fulltext URL and release type.
+ """
+
+ with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ ojs3_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
+ assert meta is not None
+ assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
+ assert meta.doi == "10.5210/fm.v25i10.10274"
+ assert meta.contrib_names == [
+ "Calvin Liang",
+ "Jevan Alexander Hutson",
+ "Os Keyes",
+ ]
+ assert meta.container_name == "First Monday"
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_issn == "1396-0466"
+ # "2020/09/10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.lang == "en"
+ # NOTE(review): the expected abstract contains mis-encoded punctuation
+ # sequences; presumably the text is garbled somewhere between the fixture
+ # and the extractor -- confirm before "fixing" the literal, since it is the
+ # exact string currently asserted.
+ assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_dlib() -> None:
+ """
+ Checks that DOI and release date are extracted from the D-Lib article
+ fixture; no other fields are asserted for this page.
+ """
+
+ with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ dlib_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
+ assert meta is not None
+ assert meta.doi == "10.1045/may2017-vanhyning"
+ # "2017-05-15"
+ assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
+def test_html_metadata_dc_case() -> None:
+ """
+ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
+
+ The snippet spells the tag name in mixed case ("DC.Citation.Issue") and the
+ test expects its content to still be picked up as the issue.
+ """
+
+ snippet = """
+ <html>
+ <head>
+ <meta name="DC.Citation.Issue" content="123"/>
+ </head>
+ <body>Hi.</body>
+ </html>"""
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(snippet))
+ assert meta is not None
+ assert meta.issue == "123"
+
+@pytest.fixture
+def adblock() -> Any:
+ """
+ Pytest fixture returning whatever load_adblock_rules() produces; it is
+ passed through to html_extract_resources() by the tests that request it.
+ """
+ return load_adblock_rules()
+
+def test_html_resources(adblock) -> None:
+ """
+ Runs html_extract_resources() over several landing-page fixtures.
+
+ The D-Lib and PLOS pages carry assertions (an expected stylesheet, and
+ URLs that the adblock rules should have filtered out). The First Monday,
+ eLife and Nature pages are only run through the extractor, with no
+ assertions on the result.
+ """
+
+ with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ dlib_html = f.read()
+
+ resources = html_extract_resources(
+ "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
+ HTMLParser(dlib_html),
+ adblock,
+ )
+
+ # the stylesheet is expected as an absolute URL on the page's host
+ assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources
+
+ # check that adblock filtering is working
+ for r in resources:
+ assert '/ga.js' not in r['url']
+
+ with open('tests/files/plos_one_article.html', 'r') as f:
+ plos_html = f.read()
+
+ resources = html_extract_resources(
+ "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+ HTMLParser(plos_html),
+ adblock,
+ )
+
+ # check that the custom adblock rules are working
+ for r in resources:
+ assert 'crossmark-cdn.crossref.org' not in r['url']
+
+ # smoke check only: nothing is asserted about these resources
+ with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ monday_html = f.read()
+
+ resources = html_extract_resources(
+ "https://firstmonday.org/blah/",
+ HTMLParser(monday_html),
+ adblock,
+ )
+
+ # smoke check only: nothing is asserted about these resources
+ with open('tests/files/elife_article.html', 'r') as f:
+ elife_html = f.read()
+
+ resources = html_extract_resources(
+ "https://elife.org/blah/",
+ HTMLParser(elife_html),
+ adblock,
+ )
+
+ # smoke check only: nothing is asserted about these resources
+ with open('tests/files/nature_article.html', 'r') as f:
+ nature_html = f.read()
+
+ resources = html_extract_resources(
+ "https://nature.com/blah/",
+ HTMLParser(nature_html),
+ adblock,
+ )
+
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
new file mode 100644
index 0000000..b51f721
--- /dev/null
+++ b/python/tests/test_ingest.py
@@ -0,0 +1,207 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import *
+from test_wayback import *
+from test_savepagenow import *
+from test_grobid import REAL_TEI_XML
+
+
+@pytest.fixture
+def ingest_worker(wayback_client, spn_client):
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ worker = IngestFileWorker(
+ wayback_client=wayback_client,
+ spn_client=spn_client,
+ grobid_client=grobid_client,
+ )
+ return worker
+
+@pytest.fixture
+def ingest_worker_pdf(wayback_client_pdf, spn_client):
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ pgrest_client = SandcrawlerPostgrestClient(
+ api_url="http://dummy-postgrest",
+ )
+ worker = IngestFileWorker(
+ wayback_client=wayback_client_pdf,
+ spn_client=spn_client,
+ grobid_client=grobid_client,
+ pgrest_client=pgrest_client,
+ )
+ return worker
+
+
+@responses.activate
+def test_ingest_success(ingest_worker_pdf):
+
+ with open('tests/files/dummy.pdf', 'rb') as f:
+ pdf_bytes = f.read()
+
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "http://dummy-host/",
+ }
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=pdf_bytes)
+ responses.add(responses.GET,
+ 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ status=200,
+ body=json.dumps([]))
+ responses.add(responses.GET,
+ 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ status=200,
+ body=json.dumps([]))
+ responses.add(responses.POST,
+ 'http://dummy-grobid/api/processFulltextDocument', status=200,
+ body=REAL_TEI_XML, content_type='text/xml')
+
+ resp = ingest_worker_pdf.process(request)
+
+ print(resp)
+ assert resp['hit'] == True
+ assert resp['status'] == "success"
+ assert resp['request'] == request
+ assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
+ assert type(resp['terminal']['terminal_dt']) == str
+ assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
+ assert resp['terminal']['terminal_status_code']
+ assert type(resp['file_meta']['size_bytes']) == int
+ assert resp['file_meta']['mimetype'] == "application/pdf"
+ assert resp['cdx']['url'] == TARGET + "/redirect"
+ assert 'warc_path' not in resp['cdx']
+ assert 'revisit_cdx' not in resp
+ assert resp['grobid']['status'] == "success"
+ assert resp['grobid']['status_code'] == 200
+ assert resp['grobid']['grobid_version']
+ assert 'fatcat_release' in resp['grobid']
+ assert 'grobid_version' not in resp['grobid']['metadata']
+ assert 'fatcat_release' not in resp['grobid']['metadata']
+ assert not 'tei_xml' in resp['grobid']
+ assert resp['pdf_meta']['status'] == "success"
+ assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
+ assert resp['pdf_meta'].get('text') is None
+
+@responses.activate
+def test_ingest_landing(ingest_worker):
+ """
+ Ingest where the fetched body is WARC_BODY rather than a PDF: the worker
+ is expected to report "no-pdf-link" and to omit the file_meta, cdx,
+ revisit_cdx and grobid sections from the result.
+ """
+
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "http://dummy-host/",
+ }
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ # status URL registered twice: a "pending" body, then a "success" body
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY)
+
+ # this is for second time around; don't want to fetch same landing page
+ # HTML again and result in a loop
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>")
+
+ resp = ingest_worker.process(request)
+
+ print(resp)
+ assert resp['hit'] == False
+ assert resp['status'] == "no-pdf-link"
+ assert resp['request'] == request
+ assert 'terminal' in resp
+ assert 'file_meta' not in resp
+ assert 'cdx' not in resp
+ assert 'revisit_cdx' not in resp
+ assert 'grobid' not in resp
+
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+
+ ingest_worker.base_url_blocklist = [
+ '://test.fatcat.wiki/',
+ ]
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "skip-url-blocklist"
+ assert resp['request'] == request
+
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+
+ ingest_worker.wall_blocklist = [
+ '://test.fatcat.wiki/',
+ ]
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "skip-wall"
+ assert resp['request'] == request
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "blocked-cookie"
+ assert resp['request'] == request
+
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
new file mode 100644
index 0000000..429c6b0
--- /dev/null
+++ b/python/tests/test_live_wayback.py
@@ -0,0 +1,167 @@
+
+"""
+This file contains tests to run against "live" wayback services. They default
+to "skip" because you need authentication, and we shouldn't hit these services
+automatically in CI.
+
+To run them, comment out (or remove) the "@pytest.mark.skip" decorators.
+"""
+
+import json
+import pytest
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient()
+ return client
+
+@pytest.fixture
+def wayback_client():
+ client = WaybackClient()
+ return client
+
+@pytest.fixture
+def spn_client():
+ client = SavePageNowClient()
+ return client
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_fetch(cdx_client):
+
+ # org,plos,journals)/plosone/article?id=10.1371/journal.pone.0093949 20181105121428 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949 text/html 200 OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV - - 25338 240665973 MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz
+
+ url = "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949"
+ datetime = "20181105121428"
+ resp = cdx_client.fetch(url, datetime)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
+ assert resp.warc_csize == 25338
+ assert resp.warc_offset == 240665973
+ assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+
+ # bogus datetime; shouldn't match
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(url, "12345678123456")
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_lookup_best(cdx_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")
+
+ # won't know datetime, hash, etc
+ assert resp.url in (url, url.replace("https://", "http://"))
+ assert resp.mimetype == "application/pdf"
+ assert resp.status_code == 200
+
+ url = "https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.gu33570g87v71007"
+ resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")
+
+ assert resp.url in (url, url.replace("https://", "http://"))
+ assert resp.mimetype == "text/html"
+ assert resp.status_code == 200
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_wayback_fetch(wayback_client):
+
+ resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+
+ assert resp.body
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_lookup_resource_success(wayback_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.terminal_url in (url, url.replace("https://", "http://"))
+ assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_fetch_spn2(cdx_client):
+ """
+ Exact-timestamp CDX fetches with filter_status_code=200 for two URLs
+ whose known CDX rows are quoted in the comments below; the row returned
+ must be the one carrying the expected digest and a 200 status.
+ """
+
+ # https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 20200110210133
+
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20191201203206 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 FPXVUJR7RXVGO6RIY5HYB6JVT7OD53SG - - 5026 364192270 liveweb-20191201204645/live-20191201195942-wwwb-app52.us.archive.org.warc.gz
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210044 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 OIQ3TKPBQLYYXQDIG7D2ZOK7IJEUEAQ7 - - 5130 710652442 liveweb-20200110204521-wwwb-spn20.us.archive.org-8001.warc.gz
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210133 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ - - 5126 544508422 liveweb-20200110205247-wwwb-spn01.us.archive.org-8000.warc.gz
+
+ url = "https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424"
+ datetime = "20200110210133"
+ resp = cdx_client.fetch(url, datetime, filter_status_code=200)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ"
+ assert resp.status_code == 200
+
+ # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
+
+ # The first two rows share the requested timestamp; only the first is a 200.
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+
+ url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
+ datetime = "20200110222410"
+ resp = cdx_client.fetch(url, datetime, filter_status_code=200)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
+ assert resp.status_code == 200
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_lookup_ftp(wayback_client):
+ # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
+ # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf
+ # ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf
+
+ # revisit!
+ url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.terminal_url == url
+ assert resp.terminal_status_code == 226
+ assert resp.cdx.url == url
+ assert resp.revisit_cdx
+ assert resp.revisit_cdx.url != url
+
+ file_meta = gen_file_metadata(resp.body)
+ assert file_meta['sha1hex'] == resp.cdx.sha1hex
+
+ # not revisit?
+ url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.terminal_url == url
+ assert resp.terminal_status_code == 226
+ assert resp.cdx.url == url
+
+ file_meta = gen_file_metadata(resp.body)
+ assert file_meta['sha1hex'] == resp.cdx.sha1hex
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_crawl_ftp(spn_client, wayback_client):
+
+ url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
+ resp = spn_client.crawl_resource(url, wayback_client)
+
+ # FTP isn't supported yet!
+ #assert resp.hit == True
+ #assert resp.status == "success"
+ #assert resp.terminal_url == url
+ #assert resp.cdx.url == url
+
+ assert resp.hit == False
+ assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
new file mode 100644
index 0000000..29f9e9f
--- /dev/null
+++ b/python/tests/test_misc.py
@@ -0,0 +1,77 @@
+
+import pytest
+
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+
+def test_gen_file_metadata():
+
+ # valid (but very small) PDF file
+ with open('tests/files/dummy.pdf', 'rb') as f:
+ file_meta = gen_file_metadata(f.read())
+ assert file_meta == {
+ 'mimetype': 'application/pdf',
+ 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
+ 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
+ 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
+ 'size_bytes': 13264,
+ }
+
+ # valid HTML
+ fm = gen_file_metadata(
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
+ assert fm['mimetype'] == 'text/html'
+
+ # bogus text
+ fm = gen_file_metadata(b"asdf1234")
+ assert fm['mimetype'] == 'text/plain'
+ assert fm['size_bytes'] == 8
+
+def test_b32_hex():
+
+ # valid b32
+ assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+
+ # sha1hex pass-through
+ s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ assert b32_hex(s) == s
+
+ # invalid
+ with pytest.raises(ValueError):
+ assert b32_hex('blah') == 'blah'
+
+def test_parse_cdx_line():
+
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ correct = {
+ 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ 'mimetype': "application/pdf",
+ 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ 'datetime': "20170828233154",
+ 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ 'warc_offset': 931661233,
+ 'warc_csize': 210251,
+ 'http_status': 200,
+ }
+
+ assert parse_cdx_line(raw) == correct
+ assert parse_cdx_line(raw + "\n") == correct
+ assert parse_cdx_line(raw + " extra_field") == correct
+
+def test_invalid_cdx():
+
+ print("missing warc")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+ assert parse_cdx_line(raw) == None
+
+ print("bad datetime")
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+ assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+ assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+ "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
new file mode 100644
index 0000000..255e3fb
--- /dev/null
+++ b/python/tests/test_pdfextract.py
@@ -0,0 +1,68 @@
+
+import pytest
+import struct
+import responses
+import poppler
+
+from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler.pdfextract import process_pdf
+from test_wayback import wayback_client, cdx_client
+
+
+# Bytes that begin with a "%PDF" marker but are otherwise junk (plus one packed
+# big-endian 64-bit integer); test_process_fake_pdf expects "not-pdf" for them.
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
+def test_process_fake_pdf():
+ resp = process_pdf(FAKE_PDF_BYTES)
+ print(resp)
+ assert resp.status == "not-pdf"
+
+ with open('tests/files/dummy_zip.zip', 'rb') as f:
+ pdf_bytes = f.read()
+ resp = process_pdf(pdf_bytes)
+ assert resp.status == 'not-pdf'
+
+@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
+def test_process_dummy_pdf():
+ with open('tests/files/dummy.pdf', 'rb') as f:
+ pdf_bytes = f.read()
+ resp = process_pdf(pdf_bytes)
+ assert resp.status == 'success'
+ assert resp.page0_thumbnail is not None
+ assert len(resp.text) > 10
+ assert resp.meta_xml is None
+ assert resp.file_meta['mimetype'] == 'application/pdf'
+ print(resp.pdf_info)
+ print(resp.pdf_extra)
+ assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+ # 595 x 842
+ assert resp.pdf_extra['page0_height'] == 842
+ assert resp.pdf_extra['page0_width'] == 595
+ assert resp.pdf_extra['page_count'] == 1
+
+def test_pdfextract_worker_cdx(wayback_client):
+
+ sink = BlackholeSink()
+ worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
+
+ with open('tests/files/example.cdx', 'r') as cdx_file:
+ pusher = CdxLinePusher(
+ worker,
+ cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=['application/pdf'],
+ )
+ pusher_counts = pusher.run()
+ assert pusher_counts['total']
+ assert pusher_counts['pushed'] == 7
+ assert pusher_counts['pushed'] == worker.counts['total']
+
+def test_pdfextract_blob_worker():
+
+ sink = BlackholeSink()
+ worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
+
+ with open('tests/files/dummy.pdf', 'rb') as f:
+ pdf_bytes = f.read()
+
+ worker.process(pdf_bytes)
+
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
new file mode 100644
index 0000000..52f26c0
--- /dev/null
+++ b/python/tests/test_pushers.py
@@ -0,0 +1,28 @@
+
+import pytest
+
+from sandcrawler.workers import CdxLinePusher, BlackholeSink
+
+
+def test_cdx_line_pusher():
+
+ sink = BlackholeSink()
+
+ # vanilla (only default filters)
+ with open('tests/files/example.cdx', 'r') as cdx_file:
+ pusher = CdxLinePusher(sink, cdx_file)
+ counts = pusher.run()
+ assert counts['total'] == 20
+ assert counts['skip-parse'] == 1
+ assert counts['pushed'] == 19
+
+ # HTTP 200 and application/pdf
+ with open('tests/files/example.cdx', 'r') as cdx_file:
+ pusher = CdxLinePusher(sink, cdx_file,
+ filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+ counts = pusher.run()
+ assert counts['total'] == 20
+ assert counts['skip-parse'] == 1
+ assert counts['skip-http_status'] == 10
+ assert counts['skip-mimetype'] == 2
+ assert counts['pushed'] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
new file mode 100644
index 0000000..63dd887
--- /dev/null
+++ b/python/tests/test_savepagenow.py
@@ -0,0 +1,204 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
+from test_wayback import *
+
+
+# URL submitted to the mocked SPNv2 save endpoint throughout these tests.
+TARGET = "http://dummy-target.dummy"
+# Job id echoed by the mocked save response and used in the status URL.
+JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
+# Mocked status payload for a job that is still "pending".
+PENDING_BODY = {
+ "status": "pending",
+ "job_id": JOB_ID,
+ "resources": [
+ "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
+ "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
+ "https://cdn.onesignal.com/sdks/OneSignalSDK.js",
+ ]
+}
+# Mocked status payload for a finished job. "original_url" and "timestamp"
+# are the values the tests compare against terminal_url and terminal_dt.
+SUCCESS_BODY = {
+ "status": "success",
+ "job_id": JOB_ID,
+ "original_url": TARGET + "/redirect",
+ "screenshot": "http://web.archive.org/screenshot/http://brewster.kahle.org/",
+ "timestamp": "20180326070330",
+ "duration_sec": 6.203,
+ "resources": [
+ TARGET,
+ TARGET + "/redirect",
+ "http://brewster.kahle.org/",
+ "http://brewster.kahle.org/favicon.ico",
+ "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
+ "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
+ "http://brewster.kahle.org/files/2017/01/computer-1294045_960_720-300x300.png",
+ "http://brewster.kahle.org/files/2017/11/20thcenturytimemachineimages_0000.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6041-1-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6061-768x1024.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6103-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6132-225x300.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6138-1-300x225.jpg",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/images/wordpress.png",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
+ "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
+ "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
+ "http://platform.twitter.com/widgets.js",
+ "https://archive-it.org/piwik.js",
+ "https://platform.twitter.com/jot.html",
+ "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
+ "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
+ "https://syndication.twitter.com/settings",
+ "https://www.syndikat.org/en/joint_venture/embed/",
+ "https://www.syndikat.org/wp-admin/images/w-logo-blue.png",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamAdmin.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamLoginForm.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/js/functions.js?ver=4.9.4",
+ "https://www.syndikat.org/wp-content/plugins/wysija-newsletters/css/validationEngine.jquery.css?ver=2.8.1",
+ "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ ],
+ "outlinks":{
+ "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
+ }
+}
+# Mocked status payload for a failed job; test_savepagenow_remote_error
+# expects the client to report "status_ext" as the result status.
+ERROR_BODY = {
+ "status": "error",
+ "exception": "[Errno -2] Name or service not known",
+ "status_ext": "error:invalid-host-resolution",
+ "job_id": JOB_ID,
+ "message": "Couldn't resolve host for http://example5123.com.",
+ "resources": []
+}
+# Mocked CDX JSON response (header row plus one data row) for the capture of
+# TARGET + "/redirect" at the same timestamp as SUCCESS_BODY.
+CDX_SPN_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+]
+
+@pytest.fixture
+def spn_client():
+ client = SavePageNowClient(
+ v2endpoint="http://dummy-spnv2/save",
+ ia_access_key="dummy-access-key",
+ ia_secret_key="dummy-secret-key",
+ )
+ client.poll_seconds = 0.0
+ return client
+
+@responses.activate
+def test_savepagenow_success(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 4
+
+ assert resp.success == True
+ assert resp.status == "success"
+ assert resp.request_url == TARGET
+ assert resp.terminal_url == TARGET + "/redirect"
+ assert resp.terminal_dt == SUCCESS_BODY['timestamp']
+ assert resp.resources == SUCCESS_BODY['resources']
+
+@responses.activate
+def test_savepagenow_remote_error(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(ERROR_BODY))
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+ assert resp.success == False
+ assert resp.status == ERROR_BODY['status_ext']
+ assert resp.request_url == TARGET
+ assert resp.terminal_url == None
+ assert resp.terminal_dt == None
+ assert resp.resources == None
+
+@responses.activate
+def test_savepagenow_500(spn_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=500,
+ body=json.dumps(ERROR_BODY))
+
+ with pytest.raises(SavePageNowError):
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 2
+
+@responses.activate
+def test_crawl_resource(spn_client, wayback_client):
+
+ responses.add(responses.POST,
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY)
+
+ print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
+ resp = spn_client.crawl_resource(TARGET, wayback_client)
+
+ assert len(responses.calls) == 5
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.body == WARC_BODY
+ assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
+
+ assert type(resp.cdx) == CdxPartial
+ with pytest.raises(AttributeError):
+ print(resp.cdx.warc_path)
+
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
new file mode 100644
index 0000000..6bc1ca4
--- /dev/null
+++ b/python/tests/test_wayback.py
@@ -0,0 +1,172 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
+
+
+# URL and 14-digit capture timestamp shared by the CDX fixtures below
+CDX_TARGET = "http://fatcat.wiki/"
+CDX_DT = "20180812220054"
+# Canned CDX API JSON response: a header row followed by exactly one capture row.
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_SINGLE_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+# digest carried only by the CDX_MULTI_HIT row marked "best"; test_cdx_lookup_best
+# asserts that lookup_best() returns the row with this digest
+CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
+# Canned CDX API JSON response with several candidate rows for the same URL.
+# NOTE(review): every row shares the same offset/filename, so these look like
+# hand-edited variants of the single-hit capture rather than real output of the
+# command below -- confirm.
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_MULTI_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent capture, but not the requested mimetype
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # more recent capture with the requested mimetype, but non-200 status codes
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # "best": requested mimetype and status 200
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # requested mimetype and status 200, but an older capture than "best"
+ ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+@pytest.fixture
+def cdx_client():
+    """Provide a CdxApiClient configured with a dummy host URL and auth token."""
+    return CdxApiClient(
+        host_url="http://dummy-cdx/cdx",
+        cdx_auth_token="dummy-token",
+    )
+
+@responses.activate
+def test_cdx_fetch(cdx_client):
+    """fetch() with an exact URL and timestamp returns the single mocked CDX row."""
+
+    responses.add(
+        responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SINGLE_HIT),
+    )
+
+    row = cdx_client.fetch(CDX_TARGET, CDX_DT)
+
+    # a single HTTP request went to the mocked CDX endpoint
+    assert len(responses.calls) == 1
+
+    # identity of the capture
+    assert row.datetime == CDX_DT
+    assert row.url == CDX_TARGET
+    assert row.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
+    # WARC location; size and offset are compared as integers although the
+    # fixture row carries them as strings
+    assert row.warc_csize == 8445
+    assert row.warc_offset == 108062304
+    assert row.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
+
+@responses.activate
+def test_cdx_fetch_errors(cdx_client):
+    """fetch() rejects a short timestamp and raises KeyError on non-matching rows."""
+
+    # "2019" is not a full 14-digit timestamp; no response is registered
+    # yet, and this call is not counted in the total asserted below
+    with pytest.raises(ValueError):
+        cdx_client.fetch(CDX_TARGET, "2019")
+
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_SINGLE_HIT))
+
+    # the returned row's timestamp differs from the requested one by a second
+    with pytest.raises(KeyError):
+        cdx_client.fetch(CDX_TARGET, "20180812220055")
+
+    # the returned row's URL differs from the requested one
+    with pytest.raises(KeyError):
+        cdx_client.fetch("http://some-other.com", CDX_DT)
+
+    # an exact URL + timestamp match must not raise
+    cdx_client.fetch(CDX_TARGET, CDX_DT)
+    # only the three fetches made after the response was registered hit HTTP
+    assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_cdx_lookup_best(cdx_client):
+    """lookup_best() selects the row marked "best" in CDX_MULTI_HIT when PDF is preferred."""
+
+    responses.add(
+        responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_MULTI_HIT),
+    )
+
+    best = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
+
+    # all candidate rows come back from one CDX request
+    assert len(responses.calls) == 1
+
+    assert best.datetime == CDX_DT
+    assert best.url == CDX_TARGET
+    assert best.sha1b32 == CDX_BEST_SHA1B32
+    # the filename column (last field) is compared against the single-hit row
+    assert best.warc_path == CDX_SINGLE_HIT[1][-1]
+
+
+# URL that the mocked wayback resource reports from get_location()
+WARC_TARGET = "http://fatcat.wiki/"
+# HTML payload returned as the body of the mocked wayback resource; it carries a
+# citation_pdf_url <meta> tag
+WARC_BODY = b"""
+<html>
+ <head>
+ <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
+ </head>
+ <body>
+ <h1>my big article here</h1>
+ blah
+ </body>
+</html>
+"""
+
+def _mocked_wayback_client(cdx_client, mocker, body_bytes):
+    """Build a WaybackClient whose `rstore` is replaced by mocks.
+
+    The mocked store's load_resource() always returns one resource that
+    reports status (200, "Ok"), is not a revisit, has WARC_TARGET as its
+    location, and whose raw content reads back as `body_bytes`.
+    """
+    client = WaybackClient(
+        cdx_client=cdx_client,
+        petabox_webdata_secret="dummy-petabox-secret",
+    )
+    # mock out the wayback store with mock stuff
+    client.rstore = mocker.Mock()
+    resource = mocker.Mock()
+    client.rstore.load_resource = mocker.MagicMock(return_value=resource)
+    resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
+    resource.is_revisit = mocker.MagicMock(return_value=False)
+    resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
+    body = mocker.Mock()
+    resource.open_raw_content = mocker.MagicMock(return_value=body)
+    body.read = mocker.MagicMock(return_value=body_bytes)
+
+    return client
+
+@pytest.fixture
+def wayback_client(cdx_client, mocker):
+    """WaybackClient with a mocked store whose resource body is WARC_BODY (HTML)."""
+    return _mocked_wayback_client(cdx_client, mocker, WARC_BODY)
+
+@pytest.fixture
+def wayback_client_pdf(cdx_client, mocker):
+    """WaybackClient with a mocked store whose resource body is tests/files/dummy.pdf."""
+
+    # the path is relative: the test run must start from the directory
+    # that contains tests/
+    with open('tests/files/dummy.pdf', 'rb') as f:
+        pdf_bytes = f.read()
+
+    return _mocked_wayback_client(cdx_client, mocker, pdf_bytes)
+
+@responses.activate
+def test_wayback_fetch(wayback_client):
+    """fetch_petabox() and fetch_petabox_body() return the mocked resource content."""
+    # the arguments are arbitrary placeholders; the store is fully mocked
+    record = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
+    assert record.body == WARC_BODY
+    assert record.location == WARC_TARGET
+
+    # the body-only variant returns the raw bytes directly
+    raw_body = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
+    assert raw_body == WARC_BODY
+
+
+@responses.activate
+def test_lookup_resource_success(wayback_client):
+    """lookup_resource() reports a hit when the CDX lookup returns candidate rows."""
+
+    responses.add(responses.GET,
+        'http://dummy-cdx/cdx',
+        status=200,
+        body=json.dumps(CDX_MULTI_HIT))
+
+    resp = wayback_client.lookup_resource(CDX_TARGET)
+
+    # identity check: `hit` must be the boolean True, not merely truthy
+    assert resp.hit is True
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
new file mode 100644
index 0000000..a996c56
--- /dev/null
+++ b/python/tests/test_xml.py
@@ -0,0 +1,18 @@
+
+import pytest
+
+from sandcrawler.xml import xml_reserialize
+
+
+def test_xml_reserialize() -> None:
+    """xml_reserialize() turns an ISO-8859-1 XML document into a str declaring UTF-8."""
+
+    with open('tests/files/scielo_article.jats.xml', 'rb') as xml_file:
+        original_bytes = xml_file.read()
+
+    # preconditions on the fixture: it declares ISO-8859-1, decodes as such,
+    # and is not valid UTF-8 as stored
+    assert b'encoding="ISO-8859-1"' in original_bytes
+    original_bytes.decode("ISO-8859-1")
+    with pytest.raises(UnicodeDecodeError):
+        original_bytes.decode("utf-8")
+
+    reserialized = xml_reserialize(original_bytes)
+    assert 'encoding="UTF-8"' in reserialized
diff --git a/python/title_slug_denylist.txt b/python/title_slug_denylist.txt
new file mode 120000
index 0000000..5bca386
--- /dev/null
+++ b/python/title_slug_denylist.txt
@@ -0,0 +1 @@
+../scalding/src/main/resources/slug-denylist.txt \ No newline at end of file
diff --git a/mapreduce/Pipfile b/python_hadoop/Pipfile
index 129b23e..42fb095 100644
--- a/mapreduce/Pipfile
+++ b/python_hadoop/Pipfile
@@ -25,6 +25,9 @@ requests = "*"
wayback = {version=">=0.2.1.2", index="ia"}
xmltodict = "*"
raven = "*"
+pykafka = "*"
+python-snappy = "*"
+boto3 = "*"
[requires]
python_version = "3.5"
diff --git a/python_hadoop/Pipfile.lock b/python_hadoop/Pipfile.lock
new file mode 100644
index 0000000..1d53667
--- /dev/null
+++ b/python_hadoop/Pipfile.lock
@@ -0,0 +1,990 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "d86e088fe8fe61715668eb35fa7a1d0a78670a782754b556aee0c7f741916aad"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.5"
+ },
+ "sources": [
+ {
+ "name": "ia",
+ "url": "https://devpi.archive.org/wb/prod",
+ "verify_ssl": true
+ },
+ {
+ "name": "pypi",
+ "url": "https://pypi.python.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "boto3": {
+ "hashes": [
+ "sha256:817b6f5e5277a9e370702314adbfcaa6957e138540e50d6b557a717846c6c999",
+ "sha256:8880415ca6d2531dd76c392a00824d952a3074886352bb342c8f8f1cb9403c1a"
+ ],
+ "index": "ia",
+ "version": "==1.9.99"
+ },
+ "botocore": {
+ "hashes": [
+ "sha256:9092d61cbf8052471dcaaac29f8cd1b9dbd5687947719f40dbc30a72c87523f2",
+ "sha256:ac50b9f793164a00ca725dfe60fe2d12a967272b251e6533236139dcade1ee5c"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==1.12.99"
+ },
+ "cachetools": {
+ "hashes": [
+ "sha256:219b7dc6024195b6f2bc3d3f884d1fef458745cd323b04165378622dcc823852",
+ "sha256:9efcc9fab3b49ab833475702b55edd5ae07af1af7a4c627678980b45e459c460"
+ ],
+ "version": "==3.1.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7",
+ "sha256:993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"
+ ],
+ "version": "==2018.11.29"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "click": {
+ "hashes": [
+ "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+ "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==7.0"
+ },
+ "crawllib": {
+ "hashes": [
+ "sha256:01c47e22757482a7ffa15396a12dcfe27ac90347b1759ae6804d02a7ef6888cf"
+ ],
+ "version": "==0.1.4.1"
+ },
+ "dawg": {
+ "hashes": [
+ "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
+ "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
+ "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
+ "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
+ "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
+ "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18",
+ "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
+ "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
+ "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
+ "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
+ ],
+ "version": "==0.7.8"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e",
+ "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "docutils": {
+ "hashes": [
+ "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
+ "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274",
+ "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"
+ ],
+ "version": "==0.14"
+ },
+ "dogpile.cache": {
+ "hashes": [
+ "sha256:691b7f199561c4bd6e7e96f164a43cc3781b0c87bea29b7d59d859f873fd4a31"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.7.1"
+ },
+ "elasticsearch": {
+ "hashes": [
+ "sha256:658380fd60bdaf746fef12958f0abc49063218ce93ee1ae4ca1fe6291c896433",
+ "sha256:ae91b089f2f2b5b3daa04297949e5f805ab12d187218cb587273f472656fd250"
+ ],
+ "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.7' and python_version < '4' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==5.5.3"
+ },
+ "flask": {
+ "hashes": [
+ "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48",
+ "sha256:a080b744b7e345ccfcbc77954861cb05b3c63786e93f2b3875e0913d44b43f05"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.0.2"
+ },
+ "globalwayback": {
+ "hashes": [
+ "sha256:257c93800c82f77c35002978b2fd9db8e60e40744def0b18eea6b34e704260b8"
+ ],
+ "index": "ia",
+ "version": "==0.3.23.1"
+ },
+ "google-api-core": {
+ "hashes": [
+ "sha256:85693e163a1a6faea69a74f8feaf35d54dfa2559fbdbbe389c93ffb3bb4c9a79",
+ "sha256:eea2d223f7bdc6d68dd1c4681e17cded5a00b5a8e686e1597b89f27f58cf2980"
+ ],
+ "version": "==1.7.0"
+ },
+ "google-auth": {
+ "hashes": [
+ "sha256:0f7c6a64927d34c1a474da92cfc59e552a5d3b940d3266606c6a28b72888b9e4",
+ "sha256:20705f6803fd2c4d1cc2dcb0df09d4dfcb9a7d51fd59e94a3a28231fd93119ed"
+ ],
+ "version": "==1.6.3"
+ },
+ "google-cloud-core": {
+ "hashes": [
+ "sha256:9bee63e0991be9801a4baf0b7841cf54f86c6e7fec922f45ea74cd4032ed4ee4",
+ "sha256:d85b1aaaf3bad9415ad1d8ee5eadce96d7007a82f13ce0a0629a003a11e83f29"
+ ],
+ "version": "==0.29.1"
+ },
+ "google-cloud-dataproc": {
+ "hashes": [
+ "sha256:785e645690f344873cd6f22454db2a39236a2ce5af2b392efbb91ad57944ebac",
+ "sha256:e6a6c380757e22e9a45cf5b261be6d6a4262f87ee172a6c21f6f7ad6013827cd"
+ ],
+ "version": "==0.3.1"
+ },
+ "google-cloud-logging": {
+ "hashes": [
+ "sha256:104e8013afa3a75a8b40240205d7078b04dded332a29b0042b16df58f81c9a8c",
+ "sha256:13ac67399289b202b409e6cef7a87dea32ddabf902f69a677bd05554f6aecf0b"
+ ],
+ "version": "==1.10.0"
+ },
+ "google-cloud-storage": {
+ "hashes": [
+ "sha256:a3115c22a71e2f172fade72c7b7b797a071f3ac9b66043191fc84c214ba0c671",
+ "sha256:aef243b533144c11c9ff750565c43dffe5445debb143697002edb6205f64a437"
+ ],
+ "version": "==1.14.0"
+ },
+ "google-resumable-media": {
+ "hashes": [
+ "sha256:2dae98ee716efe799db3578a7b902fbf5592fc5c77d3c0906fc4ef9b1b930861",
+ "sha256:3e38923493ca0d7de0ad91c31acfefc393c78586db89364e91cb4f11990e51ba"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.3.2"
+ },
+ "googleapis-common-protos": {
+ "hashes": [
+ "sha256:d56ca712f67fff216d3be9eeeb8360ca59066d0365ba70b137b9e1801813747e"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==1.5.8"
+ },
+ "grpcio": {
+ "hashes": [
+ "sha256:0134bab8e8d16b195547f9216517b3abcd3e4b6b1f5a1c8940099888003287ac",
+ "sha256:084d4a5f34a671bd0ec4668d3a7a3351015de81e6d4aef6710d9dab026def8cc",
+ "sha256:1ab29724526d8651c8b878257775e17cf3fba7474c01edc76ff8bcfecf570f91",
+ "sha256:1bd017ca22a126af0d7d67b4140b427ae58fd6d79dbd277e6f21be3ee0fdfef7",
+ "sha256:25e7b619973e20d8f2cf05d6af0f2e11263a8792b99c058a5b590ef7aef554b8",
+ "sha256:2e836e6092e6639cc9edb486f27c6fe078408aac54ed345c5762edcf8588d9c2",
+ "sha256:34870eb5d157fe9639f263f0bfe0bcdc1737a6c08181ce113585f6461f37c84b",
+ "sha256:424c8f0748935932d28531ce6d817a11914dfb385b86fe815297f122cd04d592",
+ "sha256:43c42570f769748982c61a249e01eec5f91149e2aa98438c893de64e649d562b",
+ "sha256:4f845d13ecff25012fc9c7f22067fca1d2b3da3f693da146ddcc587fdab3e7b4",
+ "sha256:614de7d6672eb023c08dde70b103efa9faacf86ac63b2a24f8d74b064a86f6f0",
+ "sha256:6c5956292692f385bb12b5f47afd70ae9469d2ee07a949c94aef2946020c1300",
+ "sha256:7030674682433a5cbc069cd5a5fbcdf193c8a3680dc161cd7b984f72ab609f23",
+ "sha256:77fff21bee2d3c3487891cdb69b35190deddac609e48c05262e1097f0b2cd82a",
+ "sha256:8ac64f3e17e6a13abf9628f0ba22012c948d7ab400592510fed3c62444bdcc0d",
+ "sha256:8fdfa8129e1ab2cdf053956dd07b21ccc127c8a8f0c5b83ff60987c009ddb636",
+ "sha256:8ff4935abf61206479dd42c56aba0f6c395aebb5c42b29b1f7c2faae41ad979c",
+ "sha256:9af47d0f4137a2951b73ee592bdc5690b242cfe81cdfacba1b34becbf72a0d59",
+ "sha256:9da5b3c883621afca008d2c5729ddd7f06153f5dcaae1f690bead9b9018a3594",
+ "sha256:abe825aa49e6239d5edf4e222c44170d2c7f6f4b1fd5286b4756a62d8067e112",
+ "sha256:c8330efa27af2b65aa556a66517ba6657a13e259670ad32dec1b6ff3d6616c3c",
+ "sha256:dc3d09abe7b49e84516b53920320d0f0d05587f6398431e50d6a47bd7d27a8b6",
+ "sha256:deb08edefef880609f8bd2945764f31d577785ff3f2daea7027b67432ff12f74",
+ "sha256:e019c86f55cdcd2bbc239beab14167f2e03ee92407c7c42ddf42edf6f5640cce",
+ "sha256:eb0d154c4749458353fbb5a55b39de7aa8445617c20d200729f924be125c56d0",
+ "sha256:eed5edb8f2620ad1157c8c5786809fb0a2d885969287a758752ce514274e3be0",
+ "sha256:f7a9fc2dfbbc0e838c79f908262638fb86ab326b0fbc0ea2c3dd063b3561e9e2",
+ "sha256:f9df2e626f1a8d8114a9dc05a489bdf26a8e926fbbe43112669700f25fe0abb3"
+ ],
+ "version": "==1.18.0"
+ },
+ "happybase": {
+ "hashes": [
+ "sha256:e20376e2e32291798d2226502994134c1c4e175136d8375b3c517a234fa22481"
+ ],
+ "index": "ia",
+ "version": "==1.1.0"
+ },
+ "ialib": {
+ "hashes": [
+ "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.3.0.1"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "itsdangerous": {
+ "hashes": [
+ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
+ "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.1.0"
+ },
+ "jinja2": {
+ "hashes": [
+ "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd",
+ "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==2.10"
+ },
+ "jmespath": {
+ "hashes": [
+ "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64",
+ "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63"
+ ],
+ "version": "==0.9.3"
+ },
+ "kazoo": {
+ "hashes": [
+ "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4",
+ "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==2.5.0"
+ },
+ "markupsafe": {
+ "hashes": [
+ "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432",
+ "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b",
+ "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9",
+ "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af",
+ "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834",
+ "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd",
+ "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d",
+ "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7",
+ "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b",
+ "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3",
+ "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c",
+ "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2",
+ "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7",
+ "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36",
+ "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1",
+ "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e",
+ "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1",
+ "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c",
+ "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856",
+ "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550",
+ "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492",
+ "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672",
+ "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401",
+ "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6",
+ "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6",
+ "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c",
+ "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd",
+ "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.1.0"
+ },
+ "mrjob": {
+ "hashes": [
+ "sha256:1979504fd9a65ee0889ac7e4151fc30f1d32a6bdb2d3c74462b9f104aaf9be68",
+ "sha256:3ba27276e213d317efdd7183044d95f93c3a2175b16016f6b6506aeae1cd54d4"
+ ],
+ "index": "ia",
+ "version": "==0.6.7"
+ },
+ "pillow": {
+ "hashes": [
+ "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
+ "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
+ "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
+ "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
+ "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
+ "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
+ "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
+ "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
+ "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
+ "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
+ "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
+ "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
+ "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
+ "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
+ "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
+ "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
+ "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
+ "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
+ "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
+ "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
+ "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
+ "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
+ "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
+ ],
+ "version": "==3.1.1"
+ },
+ "ply": {
+ "hashes": [
+ "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3",
+ "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"
+ ],
+ "version": "==3.11"
+ },
+ "protobuf": {
+ "hashes": [
+ "sha256:10394a4d03af7060fa8a6e1cbf38cea44be1467053b0aea5bbfcb4b13c4b88c4",
+ "sha256:1489b376b0f364bcc6f89519718c057eb191d7ad6f1b395ffd93d1aa45587811",
+ "sha256:1931d8efce896981fe410c802fd66df14f9f429c32a72dd9cfeeac9815ec6444",
+ "sha256:196d3a80f93c537f27d2a19a4fafb826fb4c331b0b99110f985119391d170f96",
+ "sha256:46e34fdcc2b1f2620172d3a4885128705a4e658b9b62355ae5e98f9ea19f42c2",
+ "sha256:4b92e235a3afd42e7493b281c8b80c0c65cbef45de30f43d571d1ee40a1f77ef",
+ "sha256:574085a33ca0d2c67433e5f3e9a0965c487410d6cb3406c83bdaf549bfc2992e",
+ "sha256:59cd75ded98094d3cf2d79e84cdb38a46e33e7441b2826f3838dcc7c07f82995",
+ "sha256:5ee0522eed6680bb5bac5b6d738f7b0923b3cafce8c4b1a039a6107f0841d7ed",
+ "sha256:65917cfd5da9dfc993d5684643063318a2e875f798047911a9dd71ca066641c9",
+ "sha256:685bc4ec61a50f7360c9fd18e277b65db90105adbf9c79938bd315435e526b90",
+ "sha256:92e8418976e52201364a3174e40dc31f5fd8c147186d72380cbda54e0464ee19",
+ "sha256:9335f79d1940dfb9bcaf8ec881fb8ab47d7a2c721fb8b02949aab8bbf8b68625",
+ "sha256:a7ee3bb6de78185e5411487bef8bc1c59ebd97e47713cba3c460ef44e99b3db9",
+ "sha256:ceec283da2323e2431c49de58f80e1718986b79be59c266bb0509cbf90ca5b9e",
+ "sha256:fcfc907746ec22716f05ea96b7f41597dfe1a1c088f861efb8a0d4f4196a6f10"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==3.6.1"
+ },
+ "publicsuffix": {
+ "hashes": [
+ "sha256:99a3a06d6eb19c57057d17560908b757995396ad76e6513c9d17e6a7a1266c91",
+ "sha256:ae77593d269e1e5131723259cc1142c25690c20c59f2e98f67e227228028bda9",
+ "sha256:eeb90d6cb0ae26d3af43f4d53f4c5eb6cfa437ad16a73c06c6caabb8f36ae1e5"
+ ],
+ "version": "==1.1.0"
+ },
+ "pyasn1": {
+ "hashes": [
+ "sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7",
+ "sha256:da6b43a8c9ae93bc80e2739efb38cc776ba74a886e3e9318d65fe81a8b8a2c6e"
+ ],
+ "version": "==0.4.5"
+ },
+ "pyasn1-modules": {
+ "hashes": [
+ "sha256:79580acf813e3b7d6e69783884e6e83ac94bf4617b36a135b85c599d8a818a7b",
+ "sha256:a52090e8c5841ebbf08ae455146792d9ef3e8445b21055d3a3b7ed9c712b7c7c"
+ ],
+ "version": "==0.2.4"
+ },
+ "pykafka": {
+ "hashes": [
+ "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
+ ],
+ "index": "ia",
+ "version": "==2.8.0"
+ },
+ "pylru": {
+ "hashes": [
+ "sha256:e03a3d354eb8fdfa11638698e8a1f06cd3b3a214ebc0a120c603a79290d9ebec"
+ ],
+ "version": "==1.1.0"
+ },
+ "pymysql": {
+ "hashes": [
+ "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
+ "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+ ],
+ "version": "==0.9.3"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "markers": "python_version >= '2.7'",
+ "version": "==2.8.0"
+ },
+ "python-snappy": {
+ "hashes": [
+ "sha256:59c79d83350f931ad5cf8f06ccb1c9bd1087a77c3ca7e00806884cda654a6faf",
+ "sha256:5fb0e2e5487e8ee462838ff928a186ba682bf519921d9b204db7d2b4fb6ced16",
+ "sha256:64ced2234becfe661962bc4c152e38cea03a2343ad6206a45d04c9ce61ad640f",
+ "sha256:748c2c9fec50d8a88861f369083067ec35b4a5d234f07b94bca70c6f89408f14",
+ "sha256:8a7f803f06083d4106d55387d2daa32c12b5e376c3616b0e2da8b8a87a27d74a"
+ ],
+ "index": "ia",
+ "version": "==0.5.3"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:32b0891edff07e28efe91284ed9c31e123d84bea3fd98e1f72be2508f43ef8d9",
+ "sha256:d5f05e487007e29e03409f9398d074e158d920d36eb82eaf66fb1136b0c5374c"
+ ],
+ "version": "==2018.9"
+ },
+ "pyyaml": {
+ "hashes": [
+ "sha256:3d7da3009c0f3e783b2c873687652d83b1bbfd5c88e9813fb7e5b03c0dd3108b",
+ "sha256:3ef3092145e9b70e3ddd2c7ad59bdd0252a94dfe3949721633e41344de00a6bf",
+ "sha256:40c71b8e076d0550b2e6380bada1f1cd1017b882f7e16f09a65be98e017f211a",
+ "sha256:558dd60b890ba8fd982e05941927a3911dc409a63dcb8b634feaa0cda69330d3",
+ "sha256:a7c28b45d9f99102fa092bb213aa12e0aaf9a6a1f5e395d36166639c1f96c3a1",
+ "sha256:aa7dd4a6a427aed7df6fb7f08a580d68d9b118d90310374716ae90b710280af1",
+ "sha256:bc558586e6045763782014934bfaf39d48b8ae85a2713117d16c39864085c613",
+ "sha256:d46d7982b62e0729ad0175a9bc7e10a566fc07b224d2c79fafb5e032727eaa04",
+ "sha256:d5eef459e30b09f5a098b9cea68bebfeb268697f78d647bd255a085371ac7f3f",
+ "sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537",
+ "sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531"
+ ],
+ "version": "==3.13"
+ },
+ "raven": {
+ "hashes": [
+ "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
+ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
+ ],
+ "index": "ia",
+ "version": "==6.10.0"
+ },
+ "redis": {
+ "hashes": [
+ "sha256:724932360d48e5407e8f82e405ab3650a36ed02c7e460d1e6fddf0f038422b54",
+ "sha256:9b19425a38fd074eb5795ff2b0d9a55b46a44f91f5347995f27e3ad257a7d775"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==3.2.0"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
+ "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
+ ],
+ "index": "ia",
+ "version": "==2.21.0"
+ },
+ "requests-file": {
+ "hashes": [
+ "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea",
+ "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa"
+ ],
+ "version": "==1.4.3"
+ },
+ "robotexclusionrulesparser": {
+ "hashes": [
+ "sha256:d23aa14ae8145c13c95612d696736bad52a4bd0819ce8c9437ee745098fb8388"
+ ],
+ "version": "==1.7.1"
+ },
+ "rsa": {
+ "hashes": [
+ "sha256:14ba45700ff1ec9eeb206a2ce76b32814958a98e372006c8fb76ba820211be66",
+ "sha256:1a836406405730121ae9823e19c6e806c62bbad73f890574fff50efa4122c487"
+ ],
+ "version": "==4.0"
+ },
+ "s3transfer": {
+ "hashes": [
+ "sha256:7b9ad3213bff7d357f888e0fab5101b56fa1a0548ee77d121c3a3dbfbef4cb2e",
+ "sha256:f23d5cb7d862b104401d9021fc82e5fa0e0cf57b7660a1331425aab0c691d021"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.2.0"
+ },
+ "schedule": {
+ "hashes": [
+ "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
+ "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ ],
+ "version": "==0.6.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==1.12.0"
+ },
+ "sqlalchemy": {
+ "hashes": [
+ "sha256:8027fa183f5be466030617a497b2d64e0e16c8d615e5a34bdf9fab6f66bf4723"
+ ],
+ "version": "==1.2.18"
+ },
+ "surt": {
+ "hashes": [
+ "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ ],
+ "version": "==0.3.1"
+ },
+ "tabulate": {
+ "hashes": [
+ "sha256:8af07a39377cee1103a5c8b3330a421c2d99b9141e9cc5ddd2e3263fea416943"
+ ],
+ "version": "==0.8.3"
+ },
+ "thriftpy": {
+ "hashes": [
+ "sha256:309e57d97b5bfa01601393ad4f245451e989d6206a59279e56866b264a99796d",
+ "sha256:498960d6a4ebeaea1da4d85cea5d86b59c5a7aa93d5bc4c605ac33a11699e9db",
+ "sha256:6060f6354ba5aa3c0b071d87c216394d10b9116015bdba26634bafcaff86e0ca",
+ "sha256:67d8501b88e4ead17e3008db2261bcda5845e63d1e83b8168c5d96056990af3a",
+ "sha256:6baceabd40f0934186ebcfd1f559d34a9f165b65ac5d396a39ef7f61e44d9156"
+ ],
+ "version": "==0.3.9"
+ },
+ "tldextract": {
+ "hashes": [
+ "sha256:29797125db1f2e72ce2ee51f7a764ec8b1e6588812520795ffeae93bcd46bab4",
+ "sha256:84a0b275c262e34df7506e10767e357e8b5a755a3a620cdc2cfe035061f7806d"
+ ],
+ "version": "==2.2.0"
+ },
+ "twitter": {
+ "hashes": [
+ "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
+ "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
+ ],
+ "version": "==1.18.0"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ ],
+ "markers": "python_version >= '3.4'",
+ "version": "==1.22"
+ },
+ "warctools": {
+ "hashes": [
+ "sha256:ce0c6e274db8ac8810f7c97b3943e8e8deadbc3f5c982db77cddaae2d2ae6170"
+ ],
+ "version": "==4.10.0"
+ },
+ "wayback": {
+ "hashes": [
+ "sha256:e095116ce5b71e2efb06afe6bdfbc7923906aeb87dc00f1225c2b7f7013070f6"
+ ],
+ "index": "ia",
+ "version": "==0.4.1.1"
+ },
+ "wayback-esp": {
+ "hashes": [
+ "sha256:4cd5d38da78115c07f6d95f109f7f5324b874c19ae1e59c2b026a4d707879b58"
+ ],
+ "version": "==0.2.2.2"
+ },
+ "wayback-search-js": {
+ "hashes": [
+ "sha256:0f358635e12c60d41625e1d1e0ec8fc76602f2c32c08337693a2406289abbe08"
+ ],
+ "version": "==1.4.17"
+ },
+ "wbex-client": {
+ "hashes": [
+ "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.1.5"
+ },
+ "werkzeug": {
+ "hashes": [
+ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c",
+ "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b"
+ ],
+ "version": "==0.14.1"
+ },
+ "xmltodict": {
+ "hashes": [
+ "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21",
+ "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051"
+ ],
+ "index": "ia",
+ "version": "==0.12.0"
+ }
+ },
+ "develop": {
+ "astroid": {
+ "hashes": [
+ "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22",
+ "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.1.0"
+ },
+ "atomicwrites": {
+ "hashes": [
+ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+ "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==1.3.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
+ "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
+ ],
+ "version": "==18.2.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+ "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ ],
+ "version": "==0.1.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7",
+ "sha256:993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"
+ ],
+ "version": "==2018.11.29"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "coverage": {
+ "hashes": [
+ "sha256:09e47c529ff77bf042ecfe858fb55c3e3eb97aac2c87f0349ab5a7efd6b3939f",
+ "sha256:0a1f9b0eb3aa15c990c328535655847b3420231af299386cfe5efc98f9c250fe",
+ "sha256:0cc941b37b8c2ececfed341444a456912e740ecf515d560de58b9a76562d966d",
+ "sha256:10e8af18d1315de936d67775d3a814cc81d0747a1a0312d84e27ae5610e313b0",
+ "sha256:1b4276550b86caa60606bd3572b52769860a81a70754a54acc8ba789ce74d607",
+ "sha256:1e8a2627c48266c7b813975335cfdea58c706fe36f607c97d9392e61502dc79d",
+ "sha256:2b224052bfd801beb7478b03e8a66f3f25ea56ea488922e98903914ac9ac930b",
+ "sha256:447c450a093766744ab53bf1e7063ec82866f27bcb4f4c907da25ad293bba7e3",
+ "sha256:46101fc20c6f6568561cdd15a54018bb42980954b79aa46da8ae6f008066a30e",
+ "sha256:4710dc676bb4b779c4361b54eb308bc84d64a2fa3d78e5f7228921eccce5d815",
+ "sha256:510986f9a280cd05189b42eee2b69fecdf5bf9651d4cd315ea21d24a964a3c36",
+ "sha256:5535dda5739257effef56e49a1c51c71f1d37a6e5607bb25a5eee507c59580d1",
+ "sha256:5a7524042014642b39b1fcae85fb37556c200e64ec90824ae9ecf7b667ccfc14",
+ "sha256:5f55028169ef85e1fa8e4b8b1b91c0b3b0fa3297c4fb22990d46ff01d22c2d6c",
+ "sha256:6694d5573e7790a0e8d3d177d7a416ca5f5c150742ee703f3c18df76260de794",
+ "sha256:6831e1ac20ac52634da606b658b0b2712d26984999c9d93f0c6e59fe62ca741b",
+ "sha256:77f0d9fa5e10d03aa4528436e33423bfa3718b86c646615f04616294c935f840",
+ "sha256:828ad813c7cdc2e71dcf141912c685bfe4b548c0e6d9540db6418b807c345ddd",
+ "sha256:85a06c61598b14b015d4df233d249cd5abfa61084ef5b9f64a48e997fd829a82",
+ "sha256:8cb4febad0f0b26c6f62e1628f2053954ad2c555d67660f28dfb1b0496711952",
+ "sha256:a5c58664b23b248b16b96253880b2868fb34358911400a7ba39d7f6399935389",
+ "sha256:aaa0f296e503cda4bc07566f592cd7a28779d433f3a23c48082af425d6d5a78f",
+ "sha256:ab235d9fe64833f12d1334d29b558aacedfbca2356dfb9691f2d0d38a8a7bfb4",
+ "sha256:b3b0c8f660fae65eac74fbf003f3103769b90012ae7a460863010539bb7a80da",
+ "sha256:bab8e6d510d2ea0f1d14f12642e3f35cefa47a9b2e4c7cea1852b52bc9c49647",
+ "sha256:c45297bbdbc8bb79b02cf41417d63352b70bcb76f1bbb1ee7d47b3e89e42f95d",
+ "sha256:d19bca47c8a01b92640c614a9147b081a1974f69168ecd494687c827109e8f42",
+ "sha256:d64b4340a0c488a9e79b66ec9f9d77d02b99b772c8b8afd46c1294c1d39ca478",
+ "sha256:da969da069a82bbb5300b59161d8d7c8d423bc4ccd3b410a9b4d8932aeefc14b",
+ "sha256:ed02c7539705696ecb7dc9d476d861f3904a8d2b7e894bd418994920935d36bb",
+ "sha256:ee5b8abc35b549012e03a7b1e86c09491457dba6c94112a2482b18589cc2bdb9"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*' and python_version < '4' and python_version != '3.2.*'",
+ "version": "==4.5.2"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e",
+ "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "happybase-mock": {
+ "hashes": [
+ "sha256:8c91787865c869ac6f5269768a75f5ea0c846162cdd82c5cf3de7aa09ed67c3b",
+ "sha256:ebc0026169f2f4456121269524599087fb3f416d2362d824657c4ce8ec2c355e"
+ ],
+ "index": "ia",
+ "version": "==0.10.0"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "ipython": {
+ "hashes": [
+ "sha256:06de667a9e406924f97781bda22d5d76bfb39762b678762d86a466e63f65dc39",
+ "sha256:5d3e020a6b5f29df037555e5c45ab1088d6a7cf3bd84f47e0ba501eeb0c3ec82"
+ ],
+ "index": "ia",
+ "version": "==7.3.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "isort": {
+ "hashes": [
+ "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
+ "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8",
+ "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==4.3.4"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd",
+ "sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191"
+ ],
+ "version": "==0.13.2"
+ },
+ "lazy-object-proxy": {
+ "hashes": [
+ "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
+ "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
+ "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
+ "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
+ "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
+ "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
+ "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
+ "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
+ "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
+ "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
+ "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
+ "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
+ "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
+ "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
+ "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
+ "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
+ "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
+ "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
+ "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
+ "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
+ "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
+ "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
+ "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
+ "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
+ "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
+ "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
+ "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
+ "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a",
+ "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b"
+ ],
+ "version": "==1.3.1"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+ "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ ],
+ "version": "==0.6.1"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
+ "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
+ ],
+ "markers": "python_version > '2.7'",
+ "version": "==6.0.0"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:4580328ae3f548b358f4901e38c0578229186835f0fa0846e47369796dd5bcc9",
+ "sha256:68406ebd7eafe17f8e40e15a84b56848eccbf27d7c1feb89e93d8fca395706db"
+ ],
+ "version": "==0.3.4"
+ },
+ "pathlib2": {
+ "hashes": [
+ "sha256:25199318e8cc3c25dcb45cbe084cc061051336d5a9ea2a12448d3d8cb748f742",
+ "sha256:5887121d7f7df3603bca2f710e7219f3eca0eb69e0b7cc6e0a022e155ac931a7"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.3.3"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
+ "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.6.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+ "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==0.8.1"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+ "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+ "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.0.9"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+ "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ ],
+ "version": "==0.6.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
+ "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==1.7.0"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
+ "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
+ ],
+ "version": "==2.3.1"
+ },
+ "pylint": {
+ "hashes": [
+ "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492",
+ "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c"
+ ],
+ "index": "ia",
+ "version": "==2.2.2"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c",
+ "sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4"
+ ],
+ "index": "ia",
+ "version": "==4.3.0"
+ },
+ "pytest-cov": {
+ "hashes": [
+ "sha256:0ab664b25c6aa9716cbf203b17ddb301932383046082c081b9848a0edf5add33",
+ "sha256:230ef817450ab0699c6cc3c9c8f7a829c34674456f2ed8df1fe1d39780f7c87f"
+ ],
+ "index": "ia",
+ "version": "==2.6.1"
+ },
+ "pytest-pythonpath": {
+ "hashes": [
+ "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ ],
+ "index": "ia",
+ "version": "==0.7.3"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
+ "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
+ ],
+ "index": "ia",
+ "version": "==2.21.0"
+ },
+ "responses": {
+ "hashes": [
+ "sha256:c85882d2dc608ce6b5713a4e1534120f4a0dc6ec79d1366570d2b0c909a50c87",
+ "sha256:ea5a14f9aea173e3b786ff04cf03133c2dabd4103dbaef1028742fd71a6c2ad3"
+ ],
+ "index": "ia",
+ "version": "==0.10.5"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==1.12.0"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+ "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "typed-ast": {
+ "hashes": [
+ "sha256:035a54ede6ce1380599b2ce57844c6554666522e376bd111eb940fbc7c3dad23",
+ "sha256:037c35f2741ce3a9ac0d55abfcd119133cbd821fffa4461397718287092d9d15",
+ "sha256:049feae7e9f180b64efacbdc36b3af64a00393a47be22fa9cb6794e68d4e73d3",
+ "sha256:19228f7940beafc1ba21a6e8e070e0b0bfd1457902a3a81709762b8b9039b88d",
+ "sha256:2ea681e91e3550a30c2265d2916f40a5f5d89b59469a20f3bad7d07adee0f7a6",
+ "sha256:3a6b0a78af298d82323660df5497bcea0f0a4a25a0b003afd0ce5af049bd1f60",
+ "sha256:5385da8f3b801014504df0852bf83524599df890387a3c2b17b7caa3d78b1773",
+ "sha256:606d8afa07eef77280c2bf84335e24390055b478392e1975f96286d99d0cb424",
+ "sha256:69245b5b23bbf7fb242c9f8f08493e9ecd7711f063259aefffaeb90595d62287",
+ "sha256:6f6d839ab09830d59b7fa8fb6917023d8cb5498ee1f1dbd82d37db78eb76bc99",
+ "sha256:730888475f5ac0e37c1de4bd05eeb799fdb742697867f524dc8a4cd74bcecc23",
+ "sha256:9819b5162ffc121b9e334923c685b0d0826154e41dfe70b2ede2ce29034c71d8",
+ "sha256:9e60ef9426efab601dd9aa120e4ff560f4461cf8442e9c0a2b92548d52800699",
+ "sha256:af5fbdde0690c7da68e841d7fc2632345d570768ea7406a9434446d7b33b0ee1",
+ "sha256:b64efdbdf3bbb1377562c179f167f3bf301251411eb5ac77dec6b7d32bcda463",
+ "sha256:bac5f444c118aeb456fac1b0b5d14c6a71ea2a42069b09c176f75e9bd4c186f6",
+ "sha256:bda9068aafb73859491e13b99b682bd299c1b5fd50644d697533775828a28ee0",
+ "sha256:d659517ca116e6750101a1326107d3479028c5191f0ecee3c7203c50f5b915b0",
+ "sha256:eddd3fb1f3e0f82e5915a899285a39ee34ce18fd25d89582bc89fc9fb16cd2c6"
+ ],
+ "markers": "python_version < '3.7' and implementation_name == 'cpython'",
+ "version": "==1.3.1"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ ],
+ "markers": "python_version >= '3.4'",
+ "version": "==1.22"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
+ "wrapt": {
+ "hashes": [
+ "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"
+ ],
+ "version": "==1.11.1"
+ }
+ }
+}
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
new file mode 100644
index 0000000..198c949
--- /dev/null
+++ b/python_hadoop/README.md
@@ -0,0 +1,104 @@
+
+Hadoop streaming map/reduce jobs written in python using the mrjob library.
+
+## Development and Testing
+
+System dependencies on Linux (ubuntu/debian):
+
+ sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential
+ pip3 install --user pipenv
+
+On macOS (using Homebrew):
+
+ brew install libjpeg pipenv
+
+You probably need `~/.local/bin` on your `$PATH`.
+
+Fetch all python dependencies with:
+
+ pipenv install --dev
+
+Run the tests with:
+
+ pipenv run pytest
+
+Check test coverage with:
+
+ pipenv run pytest --cov --cov-report html
+ # open ./htmlcov/index.html in a browser
+
+## Troubleshooting
+
+If you get pipenv errors like:
+
+ AttributeError: '_NamespacePath' object has no attribute 'sort'
+
+ ----------------------------------------
+
+ Command "python setup.py egg_info" failed with error code 1 in /1/tmp/pip-install-h7lb6tqz/proto-google-cloud-datastore-v1/
+
+ ☤ ▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉ 0/8 — 00:00:03
+ bnewbold@bnewbold-dev$
+ bnewbold@bnewbold-dev$ pipenv install --deploy --dev
+ Installing dependencies from Pipfile.lock (e82980)…
+ An error occurred while installing proto-google-cloud-logging-v2==0.91.3! Will try again.
+ An error occurred while installing gapic-google-cloud-error-reporting-v1beta1==0.15.3! Will try again.
+ An error occurred while installing gapic-google-cloud-datastore-v1==0.15.3! Will try again.
+ An error occurred while installing proto-google-cloud-datastore-v1==0.90.4! Will try again.
+
+Then something has gone horribly wrong with your pip/pipenv/python setup. We
+don't have a good workaround yet.
+
+## Running Python Jobs on Hadoop
+
+The `../please` script automates these steps; you should use that instead.
+
+When running python streaming jobs on the actual hadoop cluster, we need to
+bundle along our python dependencies in a virtual env tarball. Building this
+tarball can be done like:
+
+ export PIPENV_VENV_IN_PROJECT=1
+ pipenv install --deploy
+ tar -czf venv-current.tar.gz -C .venv .
+
+### Extraction Task
+
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox and GROBID running on a dedicated machine:
+
+ ./extraction_cdx_grobid.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
+ tests/files/example.cdx
+
+Running from the cluster (once a ./venv-current.tar.gz tarball exists):
+
+ ./extraction_cdx_grobid.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
+ -r hadoop \
+ -c mrjob.conf \
+ --archive venv-current.tar.gz#venv \
+ hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
+
+### Backfill Task
+
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox:
+
+ ./backfill_hbase_from_cdx.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ tests/files/example.cdx
+
+Running from the cluster (once a ./venv-current.tar.gz tarball exists):
+
+ ./backfill_hbase_from_cdx.py \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ -r hadoop \
+ -c mrjob.conf \
+ --archive venv-current.tar.gz#venv \
+ hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/python_hadoop/backfill_hbase_from_cdx.py
index 6b2ec0b..6b2ec0b 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/python_hadoop/backfill_hbase_from_cdx.py
diff --git a/mapreduce/common.py b/python_hadoop/common.py
index 6710044..e596b35 100644
--- a/mapreduce/common.py
+++ b/python_hadoop/common.py
@@ -1,4 +1,5 @@
+import json
from datetime import datetime
NORMAL_MIME = (
@@ -71,3 +72,28 @@ def parse_cdx_line(raw_cdx):
# 'i' intentionally not set
heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
+
+def parse_ungrobided_line(raw_line):
+
+ line = raw_line.strip().split("\t")
+ if len(line) != 4:
+ return None
+
+ key = line[0]
+ mime = normalize_mime(line[2])
+ try:
+ f_c = json.loads(line[1])
+ cdx = json.loads(line[3])
+ except json.JSONDecodeError:
+ return None
+
+ if not (key[5:].isalnum() and len(key) == 37 and mime != None):
+ print(mime)
+ print(key)
+ print("FAIL")
+ return None
+
+ if '-' in (key, mime, f_c, cdx):
+ return None
+
+ return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}
diff --git a/mapreduce/extraction_cdx_grobid.py b/python_hadoop/extraction_cdx_grobid.py
index ed82a5e..88580e1 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/python_hadoop/extraction_cdx_grobid.py
@@ -17,6 +17,7 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import os
import xml
import json
import raven
@@ -26,10 +27,10 @@ import happybase
import mrjob
from mrjob.job import MRJob
import wayback.exception
-from wayback.resource import Resource
-from wayback.resource import ArcResource
+from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+
from common import parse_cdx_line
from grobid2json import teixml2json
@@ -37,7 +38,7 @@ from grobid2json import teixml2json
sentry_client = raven.Client()
# Specific poison-pill rows we should skip
-KEY_BLACKLIST = (
+KEY_DENYLIST = (
'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
)
@@ -74,8 +75,10 @@ class MRExtractCdxGrobid(MRJob):
def __init__(self, *args, **kwargs):
super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)
- self.mime_filter = ['application/pdf']
self.hb_table = None
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.mime_filter = ['application/pdf']
+ self.rstore = None
def grobid_process_fulltext(self, content):
r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
@@ -118,18 +121,39 @@ class MRExtractCdxGrobid(MRJob):
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.options.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.options.warc_uri_prefix))
try:
- rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
- gwb_record = rstore.load_resource(warc_uri, offset, c_size)
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox")
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
reason="archived HTTP response (WARC) was not 200",
warc_status=gwb_record.get_status()[0])
- return gwb_record.open_raw_content().read(), None
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
def extract(self, info):
@@ -162,16 +186,18 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
- info['grobid0:status'] = {'status': 'success'}
+ info['grobid0:status'] = {'status': 'partial'}
info['grobid0:tei_xml'] = grobid_response.content
# Convert TEI XML to JSON
try:
- info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
except xml.etree.ElementTree.ParseError:
- return info, dict(status="fail", reason="GROBID 200 XML parse error")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
except ValueError:
- return info, dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
tei_metadata = info['grobid0:tei_json'].copy()
for k in ('body', 'annex'):
@@ -183,6 +209,7 @@ class MRExtractCdxGrobid(MRJob):
# TODO:
info['grobid0:quality'] = None
+ info['grobid0:status'] = {'status': 'success'}
return info, None
@@ -207,9 +234,9 @@ class MRExtractCdxGrobid(MRJob):
yield _, status
return
key = info['key']
- if key in KEY_BLACKLIST:
- self.increment_counter('lines', 'blacklist')
- yield _, dict(status='blacklist', key=key)
+ if key in KEY_DENYLIST:
+ self.increment_counter('lines', 'denylist')
+ yield _, dict(status='denylist', key=key)
return
# Note: this may not get "cleared" correctly
diff --git a/python_hadoop/extraction_ungrobided.py b/python_hadoop/extraction_ungrobided.py
new file mode 100755
index 0000000..225e46f
--- /dev/null
+++ b/python_hadoop/extraction_ungrobided.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+"""
+Variant of extraction_cdx_grobid which takes a partial metadata list as input
+instead of CDX.
+
+This task list is dumped by another Hadoop job which scans over the HBase table
+quickly, which allows this job to skip a (relatively) expensive HBase read
+per-row.
+
+Requires:
+- happybase
+- mrjob
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import xml
+import json
+import raven
+import struct
+import requests
+import happybase
+import mrjob
+from mrjob.job import MRJob
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+from common import parse_ungrobided_line
+from grobid2json import teixml2json
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_DENYLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
+
+class MRExtractUnGrobided(MRJob):
+
+ # "ungrobided" TSV lines in; JSON status out
+ #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+ #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
+ INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
+ OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
+
+ def configure_args(self):
+ super(MRExtractUnGrobided, self).configure_args()
+
+ self.add_passthru_arg('--hbase-table',
+ type=str,
+ default='wbgrp-journal-extract-0-qa',
+ help='HBase table to backfill into (must exist)')
+ self.add_passthru_arg('--hbase-host',
+ type=str,
+ default='localhost',
+ help='HBase thrift API host to connect to')
+ self.add_passthru_arg('--grobid-uri',
+ type=str,
+ default='http://localhost:8070',
+ help='URI of GROBID API Server')
+ self.add_passthru_arg('--warc-uri-prefix',
+ type=str,
+ default='https://archive.org/serve/',
+ help='URI where WARCs can be found')
+
+ def __init__(self, *args, **kwargs):
+ super(MRExtractUnGrobided, self).__init__(*args, **kwargs)
+ self.hb_table = None
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.mime_filter = ['application/pdf']
+ self.rstore = None
+
+ def grobid_process_fulltext(self, content):
+ r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
+ files={'input': content})
+ return r
+
+ def mapper_init(self):
+
+ if self.hb_table:
+ return
+
+ sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
+ try:
+ host = self.options.hbase_host
+ # TODO: make these configs accessible from... mrconf.cfg?
+ hb_conn = happybase.Connection(host=host, transport="framed",
+ protocol="compact")
+ except Exception:
+ raise Exception("Couldn't connect to HBase using host: {}".format(host))
+ self.hb_table = hb_conn.table(self.options.hbase_table)
+
+ def parse_ungrobided_line(self, raw_line):
+ """Line should be TSV and have non-null fields:
+
+ - key (string)
+ - f:c (string, json)
+ - file:mime (string)
+ - file:cdx (string, json)
+ """
+
+ if (raw_line.startswith(' ') or raw_line.startswith('#')):
+ return None, dict(status="invalid", reason="line prefix", input=raw_line)
+
+ info = parse_ungrobided_line(raw_line)
+ if info is None:
+ return None, dict(status="invalid", reason="ungrobided parse")
+
+ if info['file:mime'] not in self.mime_filter:
+ return None, dict(status="skip", reason="mimetype", mimetype=info['file:mime'])
+
+ # If warc is not item/file.(w)arc.gz form, skip it
+ if len(info['file:cdx']['warc'].split('/')) != 2:
+ return None, dict(status="skip", reason="WARC path not petabox item/file", path=info['file:cdx']['warc'])
+
+ return info, None
+
+ def fetch_warc_content(self, warc_path, offset, c_size):
+ warc_uri = self.options.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.options.warc_uri_prefix))
+ try:
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ except wayback.exception.ResourceUnavailable:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ if gwb_record.get_status()[0] != 200:
+ return None, dict(status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
+
+    def extract(self, info):
+        """
+        Fetch the file described by `info['file:cdx']`, run it through GROBID,
+        and record the outcome in `info` under `file:size` and `grobid0:*` keys.
+
+        Returns an `(info, status)` pair:
+
+        - `(None, status)` when nothing could be extracted at all (WARC fetch
+          failed, or the GROBID worker could not be reached)
+        - `(info, status)` when GROBID answered but the result is unusable
+          (oversize, non-200, unparseable or non-TEI XML)
+        - `(info, None)` on full success
+        """
+        cdx = info['file:cdx']
+
+        # Fetch data from WARCs in petabox
+        pdf_bytes, fetch_status = self.fetch_warc_content(
+            cdx['warc'], cdx['offset'], cdx['c_size'])
+        if fetch_status:
+            return None, fetch_status
+        info['file:size'] = len(pdf_bytes)
+
+        # Submit to GROBID
+        try:
+            response = self.grobid_process_fulltext(pdf_bytes)
+        except requests.exceptions.ConnectionError:
+            return None, dict(status="error", reason="connection to GROBID worker")
+        info['grobid0:status_code'] = response.status_code
+
+        # Responses over 4,000,000 bytes are not stored: only an 'oversize'
+        # status (plus the HTTP status code above) is recorded for them.
+        if len(response.content) > 4000000:
+            info['grobid0:status'] = {'status': 'oversize'}
+            return info, dict(status="oversize", reason="TEI response was too large")
+
+        if response.status_code != 200:
+            # response.text is .content decoded as utf-8
+            error_body = response.text
+            info['grobid0:status'] = dict(status='error', description=error_body)
+            return info, dict(status="error", reason="non-200 GROBID HTTP status",
+                              extra=error_body)
+
+        # Marked 'partial' until the TEI-XML has been converted successfully
+        info['grobid0:status'] = {'status': 'partial'}
+        info['grobid0:tei_xml'] = response.content
+
+        # Convert TEI XML to JSON
+        conversion_failure = None
+        try:
+            tei_json = teixml2json(info['grobid0:tei_xml'], encumbered=True)
+        except xml.etree.ElementTree.ParseError:
+            conversion_failure = dict(status="fail", reason="GROBID 200 XML parse error")
+        except ValueError:
+            conversion_failure = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+        if conversion_failure is not None:
+            # The same dict is stored on the row and returned as the status
+            info['grobid0:status'] = conversion_failure
+            return info, conversion_failure
+        info['grobid0:tei_json'] = tei_json
+
+        # The metadata column is the TEI JSON minus the fulltext (copyrighted)
+        # fields
+        info['grobid0:metadata'] = {
+            field: value for field, value in tei_json.items()
+            if field not in ('body', 'annex')
+        }
+
+        # TODO: determine extraction "quality"; always None for now
+        info['grobid0:quality'] = None
+        info['grobid0:status'] = {'status': 'success'}
+
+        return info, None
+
+    @sentry_client.capture_exceptions
+    def mapper(self, _, raw_line):
+        """
+        Process one input line end to end:
+
+        1. parse filtered line
+        2. fetch data from wayback
+        3. submit to GROBID
+        4. convert GROBID response to JSON (and metadata)
+        5. determine "quality"
+        6. push results to hbase
+
+        Yields exactly one `(_, status_dict)` pair for every input line.
+        """
+        self.increment_counter('lines', 'total')
+
+        # Parse line and filter down
+        info, status = self.parse_ungrobided_line(raw_line)
+        if info is None:
+            self.increment_counter('lines', status['status'])
+            yield _, status
+            return
+
+        row_key = info['key']
+        if row_key in KEY_DENYLIST:
+            self.increment_counter('lines', 'denylist')
+            yield _, dict(status='denylist', key=row_key)
+            return
+
+        # Note: this may not get "cleared" correctly
+        sentry_client.extra_context(dict(row_key=row_key))
+
+        # Do the extraction
+        info, extraction_status = self.extract(info)
+        if info is None:
+            self.increment_counter('lines', extraction_status['status'])
+            extraction_status['key'] = row_key
+            yield _, extraction_status
+            return
+
+        # Remember the plain integer before it gets packed to bytes below
+        grobid_status_code = info.get('grobid0:status_code', None)
+
+        # Decide what to bother inserting back into HBase
+        # Basically, don't overwrite backfill fields.
+        for backfill_column in ('f:c', 'file:mime', 'file:cdx'):
+            info.pop(backfill_column, None)
+
+        # Convert fields to binary
+        # NOTE: we're not actually sending these f:*, file:* keys...
+        json_columns = ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json',
+                        'grobid0:metadata')
+        int_columns = ('file:size', 'grobid0:status_code')
+        for column, value in list(info.items()):
+            if value is None:
+                del info[column]
+            elif column in json_columns:
+                assert type(value) == dict
+                info[column] = json.dumps(value, sort_keys=True, indent=None)
+            elif column in int_columns:
+                # encode as int64 in network byte order
+                if value != {} and value != None:
+                    info[column] = struct.pack('!q', value)
+
+        # The row key is not a column: take it out before the put
+        key = info.pop('key')
+        self.hb_table.put(key, info)
+        self.increment_counter('lines', 'success')
+
+        if extraction_status is None:
+            yield _, dict(status="success",
+                          grobid_status_code=grobid_status_code, key=key,
+                          extra=extraction_status)
+        else:
+            # A row was written, but extract() reported a problem
+            yield _, dict(status="partial", key=key,
+                          grobid_status_code=grobid_status_code,
+                          reason=extraction_status['reason'])
+
+
+# When executed as a script (rather than imported), run the
+# MRExtractUnGrobided job.
+if __name__ == '__main__': # pragma: no cover
+ MRExtractUnGrobided.run()
diff --git a/mapreduce/grobid2json.py b/python_hadoop/grobid2json.py
index ca460f8..f3577b0 100755
--- a/mapreduce/grobid2json.py
+++ b/python_hadoop/grobid2json.py
@@ -31,9 +31,13 @@ import xml.etree.ElementTree as ET
ns = "http://www.tei-c.org/ns/1.0"
def all_authors(elem):
- names = [' '.join([e.findtext('./{%s}forename' % ns) or '', e.findtext('./{%s}surname' % ns) or '']).strip()
- for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns))]
- return [dict(name=n) for n in names]
+ names = []
+ for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+ given_name = e.findtext('./{%s}forename' % ns) or None
+ surname = e.findtext('./{%s}surname' % ns) or None
+ full_name = '{} {}'.format(given_name or '', surname or '').strip()
+ names.append(dict(name=full_name, given_name=given_name, surname=surname))
+ return names
def journal_info(elem):
@@ -97,11 +101,15 @@ def teixml2json(content, encumbered=True):
header = tei.find('.//{%s}teiHeader' % ns)
if header is None:
raise ValueError("XML does not look like TEI format")
+ application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+ info['grobid_version'] = application_tag.attrib['version']
+ info['grobid_timestamp'] = application_tag.attrib['when']
info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
info['journal'] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
info['date'] = (date != None) and date.attrib.get('when')
+ info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
if info['doi']:
info['doi'] = info['doi'].lower()
@@ -131,7 +139,7 @@ def main(): # pragma no cover
usage="%(prog)s [options] <teifile>...")
parser.add_argument("--no-encumbered",
action="store_true",
- help="ignore errors loading individual WARC files")
+ help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
parser.add_argument("teifiles", nargs='+')
args = parser.parse_args()
diff --git a/python_hadoop/kafka_grobid_hbase.py b/python_hadoop/kafka_grobid_hbase.py
new file mode 100755
index 0000000..b52c386
--- /dev/null
+++ b/python_hadoop/kafka_grobid_hbase.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""
+Kafka worker that consumes GROBID output from Kafka and pushes into HBase.
+
+Based on the ungrobided Hadoop job code.
+
+TODO: should binary conversion happen upstream, in the 'grobided' topic? Probably not: do it here, along with all TEI extraction/parsing.
+
+Requires:
+- requests
+- pykafka
+- happybase
+- raven
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import sys
+import xml
+import json
+import raven
+import struct
+import requests
+import argparse
+import happybase
+import pykafka
+
+from common import parse_ungrobided_line
+from grobid2json import teixml2json
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_DENYLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
+
+class KafkaGrobidHbaseWorker:
+    """
+    Consumes GROBID output messages (JSON) from a Kafka topic and writes each
+    one as a row into an HBase table.
+    """
+
+    def __init__(self, kafka_hosts, consume_topic, **kwargs):
+        """
+        kafka_hosts: Kafka broker list; 'localhost:9092' is used if falsy
+        consume_topic: name of the Kafka topic to consume from
+
+        Keyword arguments:
+        hbase_host: (required) HBase host to connect to
+        hbase_table: (required) name of the HBase table to insert into
+        consumer_group: Kafka consumer group (default 'grobid-hbase-insert2')
+
+        Any other keyword arguments are accepted and ignored.
+        """
+        self.consume_topic = consume_topic
+        self.consumer_group = kwargs.get('consumer_group', 'grobid-hbase-insert2')
+        self.kafka_hosts = kafka_hosts or 'localhost:9092'
+        self.hbase_host = kwargs['hbase_host']
+        self.hbase_table_name = kwargs['hbase_table']
+        self.hb_table = None  # connection initialized in run()
+
+    def convert_tei(self, info):
+        """
+        Convert `info['grobid0:tei_xml']` to JSON, storing the result under
+        'grobid0:tei_json' and a copy without fulltext under 'grobid0:metadata'.
+
+        Returns `(info, None)` on success. If the XML cannot be parsed or is
+        not TEI, 'grobid0:status' is set to a "fail" dict and `(info, that
+        dict)` is returned.
+        """
+
+        # Convert TEI XML to JSON
+        try:
+            info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
+        except xml.etree.ElementTree.ParseError:
+            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+            return info, info['grobid0:status']
+        except ValueError:
+            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+            return info, info['grobid0:status']
+
+        tei_metadata = info['grobid0:tei_json'].copy()
+        for k in ('body', 'annex'):
+            # Remove fulltext (copyrighted) content
+            tei_metadata.pop(k, None)
+        info['grobid0:metadata'] = tei_metadata
+        return info, None
+
+    def do_work(self, raw_line):
+        """
+        1. parse info JSON (with XML inside)
+        2. do XML -> JSON conversions
+        3. push to HBase
+
+        Returns: `(None, status)` if the row key is denylisted (nothing is
+        written); otherwise `(columns, status)` where `columns` is the dict
+        that was put into HBase and `status` is a "success" dict.
+        """
+
+        # Parse line and filter down
+        info = json.loads(raw_line)
+        key = info['key']
+        if key in KEY_DENYLIST:
+            #self.increment_counter('lines', 'denylist')
+            return None, dict(status='denylist', key=key)
+
+        # Note: this may not get "cleared" correctly
+        sentry_client.extra_context(dict(row_key=key))
+        print("inserting line to HBase: {}".format(key))
+
+        if info.get('grobid0:tei_xml'):
+            # Need to decode 'str' back in to 'bytes' (from JSON serialization)
+            info['grobid0:tei_xml'] = info['grobid0:tei_xml'].encode('utf-8')
+
+        # The HTTP status code lives in 'grobid0:status_code'. 'grobid0:status'
+        # is a status dict (it is asserted to be a dict and JSON-encoded
+        # below), so comparing it to 200 could never select a usable row.
+        if info.get('grobid0:status_code') == 200 and info.get('grobid0:tei_xml'):
+            # A conversion failure is recorded in info['grobid0:status'] by
+            # convert_tei(); the row is still inserted either way.
+            info, _ = self.convert_tei(info)
+
+        # Decide what to bother inserting back into HBase
+        # Basically, don't overwrite backfill fields.
+        grobid_status_code = info.get('grobid0:status_code', None)
+        for k in list(info.keys()):
+            if k in ('f:c', 'file:mime', 'file:cdx'):
+                info.pop(k)
+
+        # Convert fields to binary
+        for k in list(info.keys()):
+            if info[k] is None:
+                info.pop(k)
+            # NOTE: we're not actually sending these f:*, file:* keys...
+            elif k in ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json',
+                    'grobid0:metadata'):
+                assert type(info[k]) == dict
+                info[k] = json.dumps(info[k], sort_keys=True, indent=None)
+            elif k in ('file:size', 'grobid0:status_code'):
+                # encode as int64 in network byte order
+                if info[k] != {} and info[k] is not None:
+                    info[k] = struct.pack('!q', info[k])
+
+        # The row key is not a column: take it out before the put
+        key = info.pop('key')
+        self.hb_table.put(key, info)
+        #self.increment_counter('lines', 'success')
+
+        return info, dict(status="success",
+            grobid_status_code=grobid_status_code, key=key)
+
+    def run(self):
+        """
+        Connect to HBase and Kafka, then consume messages indefinitely,
+        passing each one to do_work().
+
+        Exits the process with status -1 after more than 20 consecutive
+        messages for which do_work() returned no output.
+        """
+
+        # 1. start consumer (in managed/balanced fashion, with consumer group)
+        # 2. for each thingie, do the work; if success publish to kafka; either
+        #    way... print? log?
+        # 3. repeat!
+
+        print("Starting grobid-hbase-worker...")
+        host = self.hbase_host
+        try:
+            hb_conn = happybase.Connection(host=host, transport="framed",
+                protocol="compact")
+        except Exception as err:
+            # Keep the original error attached as the explicit cause
+            raise Exception("Couldn't connect to HBase using host: {}".format(host)) from err
+        self.hb_table = hb_conn.table(self.hbase_table_name)
+        print("HBase inserting into {}".format(self.hbase_table_name))
+
+        kafka = pykafka.KafkaClient(hosts=self.kafka_hosts, broker_version="2.0.0")
+        consume_topic = kafka.topics[self.consume_topic]
+
+        sequential_failures = 0
+        consumer = consume_topic.get_balanced_consumer(
+            consumer_group=self.consumer_group,
+            managed=True,
+            auto_commit_enable=True,
+            # needed to avoid MessageSet decode errors
+            fetch_message_max_bytes=4*1024*1024,
+            # LATEST because best to miss processing than waste time re-process
+            auto_offset_reset=pykafka.common.OffsetType.LATEST,
+            compacted_topic=True)
+        print("Kafka consuming {} in group {}".format(
+            self.consume_topic,
+            self.consumer_group))
+
+        for msg in consumer:
+            #print("got a line! ")
+            grobid_output, status = self.do_work(msg.value.decode('utf-8'))
+            if grobid_output:
+                sequential_failures = 0
+            else:
+                sys.stderr.write("Failed to process GROBID extraction output: {}\n".format(status))
+                sequential_failures += 1
+                if sequential_failures > 20:
+                    sys.stderr.write("too many failures in a row, bailing out\n")
+                    sys.exit(-1)
+
+
+@sentry_client.capture_exceptions
+def main():
+    """
+    Command-line entry point: parse the options, derive the topic name from
+    --kafka-env when --consume-topic is not given, then build and run a
+    KafkaGrobidHbaseWorker.
+    """
+    option_specs = (
+        ('--kafka-hosts', dict(
+            default="localhost:9092",
+            help="list of Kafka brokers (host/port) to use")),
+        ('--kafka-env', dict(
+            default="qa",
+            help="eg, 'qa' or 'prod'")),
+        ('--consume-topic', dict(
+            default=None,
+            help="Kafka topic to consume from")),
+        ('--hbase-table', dict(
+            type=str,
+            default='wbgrp-journal-extract-0-qa',
+            help='HBase table to backfill into (must exist)')),
+        ('--hbase-host', dict(
+            type=str,
+            default='localhost',
+            help='HBase thrift API host to connect to')),
+    )
+    cli = argparse.ArgumentParser()
+    for flag, spec in option_specs:
+        cli.add_argument(flag, **spec)
+    args = cli.parse_args()
+
+    if args.consume_topic is None:
+        args.consume_topic = "sandcrawler-{}.grobid-output".format(args.kafka_env)
+
+    # Every parsed option is forwarded to the worker as a keyword argument
+    worker = KafkaGrobidHbaseWorker(**vars(args))
+    worker.run()
+
+# When executed as a script (rather than imported), run main().
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/mapreduce/mrjob.conf b/python_hadoop/mrjob.conf
index 6f36196..6f36196 100644
--- a/mapreduce/mrjob.conf
+++ b/python_hadoop/mrjob.conf
diff --git a/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml b/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
new file mode 100644
index 0000000..dbc8be5
--- /dev/null
+++ b/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
@@ -0,0 +1,2004 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">DYNAMICS OF RAILWAY FREIGHT VEHICLES</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2015">2015</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Original Citation Iwnicki</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">University of Huddersfield Repository</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">DYNAMICS OF RAILWAY FREIGHT VEHICLES</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dynamics of railway freight vehicles. Vehicle System Dynamics. pp. 1­39. ISSN 0042­3114</title>
+ <imprint>
+ <date type="published" when="2015">2015</date>
+ </imprint>
+ </monogr>
+ <note>Dynamics of railway freight vehicles</note>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Freight wagon</term>
+ <term>Vehicle dynamics</term>
+ <term>Computer simulation</term>
+ <term>Rail Freight</term>
+ <term>Running Gear Design</term>
+ <term>Freight Bogies</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>This paper summarises the historical development of railway freight vehicles and how vehicle designers have tackled the difficult challenges of producing running gear which can accommodate the very high tare to laden mass of typical freight wagons whilst maintaining stable running at the maximum required speed and good curving performance. The most common current freight bogies are described in detail and recent improvements in techniques used to simulate the dynamic behaviour of railway vehicles are summarised and examples of how these have been used to improve freight vehicle dynamic behaviour are included. A number of recent developments and innovative components and sub systems are outlined and finally two new developments are presented in more detail: the LEILA bogie and the SUSTRAIL bogie.</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>DYNAMICS OF RAILWAY FREIGHT VEHICLES</head><p>Iwnicki S.D. 1 , Stichel S. <ref type="bibr" target="#b1">2</ref> , Orlova A. 3 , Hecht M. <ref type="bibr" target="#b3">4</ref> </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>From their inception railways have been predominant in the carriage of bulk goods and railway wagons have been designed to allow this to be effected efficiently on different types of railway infrastructure. In more recent times with changes in industrial needs and competition from road and air transport railways have carried an ever declining share of freight. Although there is some evidence in some countries that this trend has started to change recently due to road congestion there is still not yet a widespread evidence of a major modal shift from road to rail which politicians have indicated is desirable. For example the European Transport White paper 2011 <ref type="bibr" target="#b0">[1]</ref> sets a target for modal shift of 30% by 2030 and 50% by 2050 from road freight to other modes such as rail or waterborne transport for distances over 300 km.</p><p>The barriers to this increased modal shift from road to rail seem to be largely due to the requirements from modern shippers for shorter end-to-end times but even more the demand is for high reliability of service and for additional features such as tracking and tracing of shipments, security and temperature control. As Hecht <ref type="bibr" target="#b1">[2]</ref> points out the lower speeds for rail freight compared with passenger services are not mainly related to lower vehicle speed capability but are more due to the fact that freight trains often travel on lower speed lines or are held for passenger traffic to pass and due to complex and lengthy shunting and handling operations and motive power and crew changes.</p><p>Nevertheless if freight vehicle speeds and acceleration and braking capabilities could allow them to be fully integrated with passenger traffic this would bring a step change in end to end freight train speeds as well as overall system capacity. 
A key factor in obtaining this increased speed is to ensure that the dynamic performance of freight vehicles can allow safe and reliable operation on track with different levels of irregularities and support conditions. Running gear has evolved with the experience of operation on different railways and more recently the use of computer simulation tools and several standardised designs are now ubiquitous. Several research projects and teams have recently been trying to advance from this position using innovative designs adapted from passenger vehicles or using other novel techniques. The use of computer simulations is now established for design of running gear and is also becoming accepted as part of the vehicle acceptance processes in many countries.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">Early developments of freight wagons</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Background</head><p>Designers of freight vehicle running gear face many challenges but not least of these is the fact that the ratio of the laden to tare mass of a freight vehicle can be as much as 5:1 compared with a more manageable 1.5:1 for typical passenger vehicles. This effectively means that the suspension system has to be designed for two different vehicles (and every stage in between). A number of clever designs have evolved over the years and the most successful of these are now summarised.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">UIC double link</head><p>Freight wagons with link type suspensions have existed for more than 100 years, as can be seen in <ref type="figure" target="#fig_0">figure 1</ref>, and the link suspension is probably still the most common suspension type for two axle freight wagons in Europe today. As early as 1890 the principle of the link suspension was defined as a standard. A review of freight wagons with link suspension can be found in <ref type="bibr" target="#b2">[3]</ref>. After World War II the UIC double link suspension was defined as a standard <ref type="bibr" target="#b4">[5]</ref>. In the beginning of the 1980s a number of improvements were made. The axle load was increased to 22.5 tonnes and the parabolic leaf spring was introduced as standard component <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>. The UIC double link suspension in figure 2 mainly consists of three parts: Leaf springs, links and axle guards. The vehicle is connected to the parabolic or leaf spring by double links. The leaf spring rests on the axle box. This arrangement allows the axle box to move in both the longitudinal and lateral direction relative to the wagon body. The axle guard restricts the horizontal motion of the axle box. The principle of the suspension is that of a pendulum. In the longitudinal direction the suspension links are inclined, whereas in the lateral direction they are in a vertical plane when the vehicle body is in nominal position <ref type="bibr" target="#b0">[1]</ref>, <ref type="bibr" target="#b7">[8]</ref>, <ref type="bibr" target="#b8">[9]</ref>, <ref type="bibr" target="#b9">[10]</ref>. The characteristics of the double-link suspension are quite complex. The main components are shown in <ref type="figure">Figure 3</ref>. 
One of the main advantages of the link running gear is that it is simple, robust and cheap and also takes up little space in both lateral and vertical directions. Both stiffness and damping are provided by one system and are load dependent. The quasistatic curving performance of the single axle running gear with link suspension is good. For a typical two-axle freight wagon with a wheelbase of 9m on dry rails good steering performance down to 300 m curve radius can be achieved <ref type="bibr" target="#b9">[10]</ref>.</p><p>The running behaviour of two-axle freight wagons with link suspension can be rather poor mainly due to vehicle hunting. The amount of damping provided in the horizontal plane is often not sufficient. Additionally the characteristics of the suspension change during the life of the vehicle, due to suspension wear, and with the running conditions <ref type="bibr" target="#b9">[10]</ref>. The link suspension takes quite a lot of longitudinal space and is a poor isolator for sound and vibration.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3">Link suspension bogies</head><p>The leaf spring and link suspension of the single-axle running gear has also been used on bogies since about 1925 <ref type="bibr" target="#b0">[1]</ref>. More recently it has been standardised with for example bogie type 931 (figure 4), developed in the 1950s by Deutsche Bahn with a wheelbase of 2000 mm and a wheel diameter of 1000 mm. This bogie was developed to run at 100 km/h with an axle load of 20 t and was the first bogie standardised by UIC <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>. In the beginning of the 1980s DB bogie type 665 was introduced with new features like parabolic leaf springs, 22.5 t permissible axle load and shorter links as shown in figure 5 <ref type="bibr" target="#b6">[7]</ref>. The bogie frame is a welded steel design but in some places forged components are used. The frame is connected to parabolic or trapezoidal leaf springs, that rest on the axlebox, being connected by swing links. Nominally the suspension links are positioned in a longitudinal vertical plane and inclined in this plane. During vehicle operation the links swing in that plane and also laterally <ref type="bibr" target="#b0">[1]</ref>, <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>, <ref type="bibr" target="#b10">[11]</ref>. A spherical centre-pivot and two side bearers connect the bogie frame and the wagon body. The side bearers can be either rigid or vertically suspended and have three functions:</p><p>• to act as static support for the carbody. • to act as roll stiffness. • to provide friction damping between carbody and bogie. The quasistatic curving performance of a bogie with link suspension is generally very good due to:</p><p>• the short wheelset distance in the bogie of 1.8 m. • the soft longitudinal primary suspension.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4">The Y25 Standard Bogie</head><p>Most railway vehicles have bogies or trucks which allow longer vehicles supported on two bogies while still keeping attack angles between wheels and rail in curves to reasonable levels. This arrangement also allows two stages of suspension with the 'primary' suspension between wheelset and bogie and secondary suspension between bogie and coach or wagon body. The primary suspension can isolate the bogie from short wavelength irregularities while the secondary suspension deals with the longer wavelength, lower frequency excitations.</p><p>As previously mentioned, a specific challenge for designers of freight vehicle running gear is the large difference between tare and laden vehicle mass. In the Y25 bogie progressive damping with vertical load is effected by the use of 'Lenoir links' which take part of the vertical load through an angled link and a pusher onto a vertical friction surface. This gives a level of damping which is broadly proportional to the vehicle mass. The Y25 bogie design originated in France in 1948 and was standardised by the ORE steering committee in 1967. It is shown in <ref type="figure">figure 6</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 6: A Y25 type bogie</head><p>The design has been hugely successful and Y25 type bogies are the most predominant freight bogie in Europe.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.5">'three-piece' Freight Bogies</head><p>The three-piece bogies were first developed in 1930s and seemed to originate simultaneously in the USA (Barber bogie) and the Soviet Union (Hanin bogie). Now the three-piece bogie and its more sophisticated descendents are the most common suspension for freight wagons across North and South Americas, CIS countries, China, Africa, India and Australia. Maximum axle loads range between 7 and 36 t. The most common standards for three-piece bogies are AAR <ref type="bibr" target="#b12">[13]</ref> for 1435 mm gauge and GOST <ref type="bibr" target="#b13">[14]</ref> for 1520 mm gauge. A review of three-piece bogies can be found in <ref type="bibr" target="#b14">[15]</ref>.</p><p>The Russian model 18-100 bogie shown in <ref type="figure">figure 7</ref> is a good example of an early type of three-piece bogie. The term 'three-piece' refers to the design of the bogie frame which consists of three interconnected parts: two side frames and one bolster. The frame parts are usually cast.</p><p>The bogie is equipped with central suspension between the side frames and the bolster that consists of a set of springs and wedge friction dampers working in vertical and lateral direction and keeping the frame square. The side frames with their flat surfaces rest on the axle-boxes (or bearing adapters). The size of the opening in the side frame provides clearances in longitudinal and lateral direction within which the axle-box moves resisted by dry friction forces. The car body rests on the flat center bowl, its roll motion relative to the bolster is limited by side bearers which are usually stiff vertical stops including clearance when the wagon body is in the central position. The three-piece bogie is a very robust design with the advantage of being low cost in production, operation and repair. 
The following items are considered as disadvantages of traditional three-piece bogie and attempts have been made to address these in its further developments <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b15">[16]</ref>, <ref type="bibr" target="#b16">[17]</ref>:</p><p>• Limited critical speed of the empty wagon (with sway oscillation of car body being the major loss of stability mode); • Wheel flange contact in curves produced by warping between side frames and bolster; • Side frames adding to the unsprung mass and thus increasing track impact on short wavelength irregularities; • Deterioration of ride performance with wear of friction wedges and other friction surfaces.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3">Computer simulation</head><p>Computer simulation of freight vehicles is not at all as common as for passenger vehicles. Since many of the European freight vehicles are standardized very little new development has been carried out and the manufacturers do in general not perform a simulation analysis of the running behaviour of freight wagon. However, in several research groups at universities and research institutes and at some consulting companies computer simulation of freight vehicles is now performed.</p><p>Since manufacturers do not usually build simulation models of freight vehicles themselves one of the main challenges in modelling a freight wagon is to obtain all the input parameters required. Another aspect is that most suspension elements are strongly non-linear and in many cases even mathematically non-smooth. This makes it very difficult to build up simulation models that provide good results compared to measurement results. Some of the phenomena observed during simulation of freight vehicles will be discussed below.</p><p>Further, as described in Section 3.1, the characteristics of the suspension elements can vary during operation due to wear or environmental effects such as for example surface contamination changing the friction coefficient in sliding surfaces.</p><p>The main purpose of simulation studies of freight vehicles is very often a stability analysis (see Section 3.2) or an investigation of the curving behaviour of the freight wagon (see Section 3.3). Since the axle loads of freight wagons are usually high, the investigation of wheel or rail wear and rolling contact fatigue is often the primary reason for a simulation study in curves.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Suspension components</head><p>The suspension in most freight vehicles relies on friction damping. Friction elements are low cost, require little maintenance and are usually load dependent. This means that the level of friction damping changes with axle load, an important feature in freight wagons due to the high tare to laden ratio already mentioned. Surveys of modelling of friction components in freight wagon can be found for example in <ref type="bibr" target="#b17">[18]</ref>- <ref type="bibr" target="#b21">[22]</ref>. Papers <ref type="bibr" target="#b17">[18]</ref> and <ref type="bibr" target="#b18">[19]</ref> are general reviews of rail vehicle suspension components, while <ref type="bibr" target="#b19">[20]</ref> is focused on freight vehicles and also discusses issues such as stability and curving of freight vehicles. Papers <ref type="bibr" target="#b20">[21]</ref> and <ref type="bibr" target="#b21">[22]</ref> are focussed on modelling friction wedges of three-piece bogies. Also in the proceedings from the Euromech 500 colloquium <ref type="bibr" target="#b22">[23]</ref> many valuable contributions on the topic of non-smooth suspension elements can be found. Various arrangements of suspension elements to simulate vehicle suspensions are documented in <ref type="bibr" target="#b23">[24]</ref>, <ref type="bibr" target="#b24">[25]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.1">Friction damping</head><p>In most freight vehicle simulation models friction is modelled as dry Coulomb friction, where the friction force is proportional to the normal load. The friction coefficient is assumed to be constant, see force-deflection curve in <ref type="figure">figure 8</ref>, left. The disadvantage of the Coulomb model is that it is non-smooth, i.e. multi-valued and non-differentiable. Another way to model friction is with a linear spring in series with a friction slider as in <ref type="figure" target="#fig_8">figure 9</ref> with the resulting force-displacement characteristic in <ref type="figure">figure 8</ref>, right. Since most friction damper arrangements have a finite flexibility, such models could also be regarded as more realistic. Note, however that the model with a spring in series is still non-smooth. To avoid the difficulties mentioned above regularization methods are often applied, see for example <ref type="bibr" target="#b25">[26]</ref>, <ref type="bibr" target="#b26">[27]</ref> and <ref type="bibr" target="#b27">[28]</ref>. Piotrowski developed a non-smooth rheological model <ref type="bibr" target="#b28">[29]</ref>, <ref type="bibr" target="#b29">[30]</ref>, which employs the notion of the differential succession involving a contingent derivative of the non- smooth, multi-valued characteristics of Coulomb friction. Tan and Rogers <ref type="bibr" target="#b30">[31]</ref> proposed equivalent viscous damping models to avoid the numerical problems of Coulomb friction. They claim that this substitution works very well for cases where sliding motions predominate.</p><p>In many running gear arrangements two-dimensional friction elements are needed, e.g. in the Y25 and in the three-piece bogie. In these designs motions in two directions tangential to the friction surfaces are possible. Two-dimensional Coulomb friction models can be found e.g. 
in <ref type="bibr" target="#b31">[32]</ref>, <ref type="bibr" target="#b32">[33]</ref>.</p><p>Another phenomenon that is important to take into account is stochastic excitations that smooth the dry friction damping. Also mid frequency excitation generated in the wheel rail contact - often called dither - can smoothen dry friction and therefore have a significant influence on the simulation results, see for example <ref type="bibr" target="#b29">[30]</ref>, <ref type="bibr" target="#b32">[33]</ref>.</p><p>True and Asmund <ref type="bibr" target="#b32">[33]</ref> investigated the effects of dry friction in the suspension of a simple freight vehicle. They used a relatively simple model of dry friction and found that the stable behaviour for the system with friction exhibited a laterally oscillating motion which makes the system sensitive to external periodic forcing. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2">Wagons with link suspension</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.1">Basic model of leaf spring and link suspension</head><p>Leaf springs are often used as vertical suspension. In multibody simulation models they are usually regarded as rigid in both the longitudinal and lateral directions. For dynamic displacements around a static equilibrium position leaf springs are characterized by a relatively high stiffness for small displacements and a significantly lower stiffness for larger displacement, (figure 10). Leaf springs are described in the ORE reports <ref type="bibr" target="#b33">[34]</ref>, <ref type="bibr" target="#b34">[35]</ref>. Since link suspensions show very similar characteristics they are often modelled in a similar way to leaf springs, at least for the lateral link behaviour. The initial higher stiffness k 1 in leaf springs is caused by friction, i.e. the leaves of a leaf spring stick together for small displacements and start to slide on each other for larger displacements. In the same way the link rolls in the end bearing as long as there is no sliding in the contact area. The lower stiffness k 2 is the value for sliding in the leaf spring or the so called pendulum stiffness of a link. The force F d determines the amount of damping in the hysteresis. A commonly used model to represent the two different stiffness values with the hysteresis is to use a linear spring and a friction element in series, in parallel with another linear spring, as shown in <ref type="figure" target="#fig_0">figure 11</ref>. It should be taken into account that the characteristics of leaf springs vary due to wear in running or deterioration or lubrication state.</p><p>The three parameters in the model described above can be derived from measurements. This model, however, is simplified since the shape of the hysteresis curve is usually rounded as shown in <ref type="figure" target="#fig_0">figure 10</ref>. 
Measurement results and more detailed descriptions of link suspensions can be found in <ref type="bibr" target="#b33">[34]</ref>- <ref type="bibr" target="#b47">[48]</ref>. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.2">Advanced simulation models</head><p>For lateral displacements of a double-link all four joints are assumed to start to slide at the same time; therefore the model in <ref type="figure" target="#fig_0">figure 11</ref> is sufficient. In the longitudinal direction, however, it is more likely that the joints start to slide at different displacements as shown e.g. by Piotrowski <ref type="bibr" target="#b28">[29]</ref>. He uses a set of four sliders and spring elements with different breakout forces in parallel to describe these characteristics. Also in a model used by Stiepel several elements in parallel are used <ref type="bibr" target="#b43">[44]</ref>.</p><p>To give a better representation of the rounded shape of the hysteresis curves, Fancher developed a model for truck leaf springs <ref type="bibr" target="#b44">[45]</ref>, <ref type="bibr" target="#b45">[46]</ref> using exponential expressions. Jönsson <ref type="bibr" target="#b41">[42]</ref> used a similar approach, where the total force over the suspension component is separated into piece-wise elastic and friction forces. The model is used for both leaf springs and double-links.</p><p>Another possibility to describe hysteresis with rounded shape for link suspensions is to use rolling contact theory, which has been proposed by <ref type="bibr">Piotrowski [33]</ref>. Based on the slip velocity the creepage in the contact is calculated.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3">Modelling the three-piece bogie</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.1">Models of the central suspension</head><p>Most of the research in modelling three-piece bogies, such as <ref type="bibr" target="#b20">[21]</ref>, <ref type="bibr" target="#b21">[22]</ref>, is focussed on the central suspension element of the three-piece truck that provides damping with friction wedges. Early models of friction wedge suspensions recognized only vertical load-dependent friction force, later models included two-dimensional friction in the vertical and lateral directions <ref type="bibr" target="#b45">[46]</ref>, <ref type="bibr" target="#b49">[50]</ref>.</p><p>The first approach to account for possible angular and longitudinal displacements of bolster relative to the side frames is to introduce warping and longitudinal nonlinear resistance characteristics into the model, as it is done in <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>. In such case the wedges are not modelled as separate bodies, but the equivalent force against displacement characteristics are introduced accounting for wedge parameters, such as inclination angle, width of the vertical surface, width of the inclined surface, friction coefficients on inclined and vertical surfaces, etc.</p><p>The second approach to account for all possible degrees of freedom between side frame and bolster is to introduce multiple contact points mapped along the edges of the wedge with two-dimensional friction force elements in each of them. Such an approach was used by Ballew et al <ref type="bibr" target="#b45">[46]</ref>, it is implemented in simulation tools such as VAMPIRE <ref type="bibr" target="#b51">[52]</ref>, and the Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>. 
Numerous contact elements require an efficient numerical simulation algorithm to be implemented into the software that provides fast solution to resulting stiff system of equations, such as the one developed by Pogorelov <ref type="bibr" target="#b56">[57]</ref>. The wedges are treated as massless. Contact type models allow the study of such complicated phenomenon as uneven distribution of contact forces over the wedge surfaces, implementation of resilient pads on wedge surfaces, jamming and wedging <ref type="bibr" target="#b53">[54]</ref>. In paper <ref type="bibr" target="#b55">[56]</ref> the authors included the mass of the wedge into consideration to study its dynamic properties.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.2">Models of the axle to side frame interaction</head><p>In the first approach similar to friction wedges the axle to side frame interaction can be described by nonlinear equivalent characteristics as in <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>. The dry friction interaction between the axle box crown and the side frame pedestal is modelled by two dimensional dry friction element in parallel with another nonlinear element that describes bumpstops in longitudinal and lateral dimension. A typical characteristic of the bumpstop element is presented in <ref type="figure" target="#fig_0">figure 12</ref>. To improve numerical integration the transition from clearance to bumpstop is often smoothed.</p><p>If the interaction between the crown and pedestal is a flat surface, then its width can result in roll stiffness that is produced by gravity. Such stiffness can be introduced into the model depending on the axle load. The second approach is to introduce multiple contact points on the edges of the crown with two-dimensional friction elements in them. The bumpstops are then also the contact elements between the axle box or adapter and the stops in the side frame jaws. Such approach is used in <ref type="bibr" target="#b56">[57]</ref> as well as in Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.3">Models of the centre bowl and side bearers</head><p>The same approaches can be applied to models of the centre bowl to centre plate interaction and at the side bearers.</p><p>In the first approach, see <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>, centre plate to centre bowl interaction works simultaneously as one dimensional yaw friction and nonlinear roll and pitch torque with soft characteristics as shown in <ref type="figure" target="#fig_0">figure 13</ref>. Knowing the clearance in the side bearers the nonlinear roll characteristic can be linearized. The second approach is to introduce multiple contact points on the edges of the centre plate with two-dimensional friction elements in them. The interaction with the centre bowl rim is then also the contact elements. Such an approach is used in <ref type="bibr" target="#b56">[57]</ref> as well as in Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Stability</head><p>Freight vehicles in most cases operate at much lower speeds than passenger vehicles. Typical running speeds are at around 100 km/h. This suggests that stability investigations are not as important as for faster passenger vehicles. On the other hand freight vehicles often are much less damped than passenger vehicles and stability investigations are therefore necessary. Several of the wagon types introduced above can - in unfavourable running conditions - show significant hunting behaviour at speeds as low as 70 km/h.</p><p>In a bogie vehicle basically three types of hunting motion can arise:</p><p>ï‚· Wheelset hunting where one wheelset performs the hunting motion.</p><formula xml:id="formula_0">ïŠ M 0 ïŠ M ïŠ ïŠ c Mg ï„ ïŠ</formula><p>ï‚· Bogie hunting where a whole bogie is taking over the hunting motion. ï‚· Carbody hunting where the carbody performs a yaw motion and the two bogies mainly follow the carbody with lateral motions, i.e. the whole vehicle takes over the hunting motion.</p><p>Carbody hunting is often a type of resonance phenomenon, where the Klingel hunting frequency given mainly by vehicle speed and conicity in the contact coincides with the yaw eigenfrequency of the carbody.</p><p>Hunting motion with a non-zero limit cycle depends on the wheel-rail geometry, the suspension and the masses and inertias of the vehicle. Since the mass and inertia, and in most cases the suspension stiffness and damping of the freight wagon will significantly change with load, the type of hunting motion observed usually differs between an empty and a loaded wagon. Since the stiffness values between axlebox and bogie frame (in a bogie vehicle) are lower in an unloaded vehicle, the risk for wheelset or bogie hunting is higher. In loaded vehicles, vehicle hunting can often be observed. 
Since the frequency of wheelset hunting is usually low (typically between 1 and 2 Hz) the wheel-rail forces induced are relatively low and in most cases below the limit values stipulated in standards. Therefore, the vehicle design in reality allows for the carbody instability to happen in some conditions. Otherwise the suspension needs to be so stiff that the curving performance would suffer, and the amount of wear and RCF would increase significantly. The risk of carbody hunting can vary with the type of load since this can influence the yaw eigenfrequency of the carbody.</p><p>Due to the significant inherent non-linearity and non-smoothness of the suspension elements, linearization of the models is usually not realistic. It is therefore necessary to perform time-stepping integration with the full non-linear model. The task is in general to find the non-linear critical speed v B of the wagon as can be seen in the generic bifurcation diagram in figure 14.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 14: Generic bifurcation diagram</head><p>In complex models it is very difficult to find the exact critical speed, for example with a path following method <ref type="bibr" target="#b57">[58]</ref>. Therefore other engineering methods are used. One possibility that has been suggested e.g. by <ref type="bibr">Polach [59]</ref> is to excite the vehicle with an initial disturbance that can either be deterministic or stochastic. After the initial disturbance the vehicle is run on ideal smooth track. If the oscillation vanishes the vehicle is regarded as stable. The simulations have to be repeated with increasing speed until the oscillations do not disappear. In that case the non-linear critical speed v b ( <ref type="figure" target="#fig_0">figure 15</ref>) is reached. A risk with this method is that the initial disturbance is not high enough to initiate a limit cycle oscillation and that the critical speed detected is higher than the real non-linear critical speed.</p><p>Another method to detect the non-linear critical speed is start the simulations at a very high speed to be sure that the vehicle has reached the non-zero attractor (limit cycle). Then the speed is continuously reduced until the limit cycle behaviour disappears. Polach also describes this method. It has been used for example by Boronenko et al <ref type="bibr" target="#b14">[15]</ref> to tune the suspension of three-piece bogies.</p><p>A similar method, shown in <ref type="figure" target="#fig_0">figure 15</ref>, is suggested in <ref type="bibr" target="#b59">[60]</ref> to determine the so-called non-linear critical speed. The difference to the method introduced above is that the speed is not reduced continuously but in discrete steps as suggested by True <ref type="bibr" target="#b98">[98]</ref>. <ref type="figure" target="#fig_0">Figure 16</ref> shows the bifurcation diagram for a loaded two-axle vehicle calculated with this method. 
It can be observed that only the stable branches of the bifurcation diagram can be determined, not the unstable part. The zero solution is also possible at least up to a speed of 120 km/h (bold solid line). This was simulated using the procedure above, starting from low speed and increasing the speed stepwise. Hoffman also investigated the stability of a two-axle wagon with link suspension <ref type="bibr" target="#b42">[43]</ref>, <ref type="bibr" target="#b60">[61]</ref>. He uses the link model developed by Piotrowski <ref type="bibr" target="#b28">[29]</ref>. The leaf springs model is based on Fancher et al <ref type="bibr" target="#b45">[46]</ref>. Gialleonardo et al <ref type="bibr" target="#b61">[62]</ref> extended this type of stability analysis for a two-axle wagon with link suspension on curved track. As can be seen in <ref type="figure" target="#fig_0">figure 18</ref>. the leading wheelset (y lw )</p><p>shows much smaller oscillation amplitudes than the trailing wheelset (y tw ) and the carbody. This is because the outer wheel of the leading wheelset experiences flange contact. In general the results show the presence of large periodic oscillations in narrow curves at commercial operating speeds. It is also shown in the paper that the coupling forces between wagon assemblies significantly reduce the oscillation amplitudes. Zhai et al <ref type="bibr" target="#b62">[63]</ref> extended the stability analysis for a freight wagon with three-piece bogies to also include a visoelastic track structure. The stability analysis is performed according to the methodology suggested by Polach, which is explained above. The authors found that a lower critical hunting speed is obtained on elastic track compared with the rigid track case. The difference in the critical hunting speeds between the elastic track base and the rigid track base is 4.4% for the loaded freight car.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3">Curving</head><p>As indicated above simulations of the running behaviour of freight wagons in curves are often performed to investigate the risk of wheel wear and Rolling Contact Fatigue (RCF).</p><p>For passenger vehicles curving simulations are often performed on ideal track, i.e. the stochastic track irregularities are neglected. Authors are in this case interested in the quasistatic behaviour of the vehicle, i.e. the mean wheelset attack angles or the mean energy dissipation in the contact points. For freight vehicles with non-linear and non- smooth suspension this can lead to significant mistakes as shown in the example from Jönsson <ref type="bibr" target="#b41">[42]</ref>. On ideal track the friction surfaces might stick together and force the wheelset into a more unfavourable position. Track irregularities help to get relative motion in the friction surfaces, which usually leads to better - and more realistic - steering behaviour of the vehicle. As seen in <ref type="figure" target="#fig_0">figure 19</ref>, the energy dissipation as a measure for the amount of wear or RCF, is much lower when simulating running with track irregularities. In one of their numerous studies on three-piece bogies Boronenko et al <ref type="bibr" target="#b14">[15]</ref> investigate the reason for excessive flange wear in some of the Russian wagons. One conclusion is that the main reason for flange wear is the unstable behaviour of the bogies in curves (rutting mode) <ref type="bibr" target="#b15">[16]</ref>, when the bogie is flanging with a two-point contact situation instead of negotiating the curve using the wheel conicity. The flanging is the result of bogie warping, which increases the angle of attack compared to a radial position. In the article a number of different designs are discussed. 
Among others it is concluded that a bogie design with radial arms significantly reduces the angle of attack and the wear number in curves, see <ref type="figure" target="#fig_1">figure 20</ref>. Berghuvud [64] investigated the curving behaviour of different types of three-piece bogie with and without braking. He concluded that the influence of braking on the curving behaviour is complex. Braking can have a positive effect on the angle of attack of the wheelsets in a curve since it helps to overcome the static friction in the primary suspension. It can also increase the angle of attack if large longitudinal forces push the wheelset longitudinally towards the limit of the play and thus lock the wheelset in an unfavourable position.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.1">Vehicle Resistance</head><p>Radially steering bogies not only reduce flange wear in curves but also reduce the required traction energy. The inner leading wheel is less affected and the trailing wheelset has much smaller values. With radial steering (<ref type="figure" target="#fig_1">figure 22</ref>), the leading axle also has very small creepages. This results in lower wear and running resistance. As a result, on track with tight curves the overall running resistance can be reduced by more than 20%, with similar levels of energy saving <ref type="bibr" target="#b65">[66]</ref>.</p><p>Of course radial steering may affect running stability on straight track. Therefore bogie designs with cross anchors such as the TVP 2007 or the Leila bogie have an advantage over individual radial steering axles as in the swing hanger bogie.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.2">Influence of curving on wheel and rail damage phenomena</head><p>As mentioned in the introduction to this section the curving performance of a freight wagon is very important for the level of wheel and rail damage. This means in turn that the vehicle track interaction in curves determines to a large extent the maintenance cost for the whole system. In <ref type="bibr" target="#b65">[66]</ref> Fröhling discusses the influence of, among others, bogie design, bogie maintenance and the wheel/rail interface in heavy haul operation on different damage phenomena on wheels and rails. In a later publication Fergusson et al <ref type="bibr" target="#b67">[67]</ref> present an analysis of wheel wear as a function of the relationship between the lateral and longitudinal primary suspension stiffness and the coefficient of friction at the centre plate between the wagon body and the bolster to minimise the wheel wear rate of a self-steering three-piece bogie without compromising vehicle stability. Simulation results indicate that wheel wear is theoretically the lowest for low lateral and longitudinal primary suspension stiffness and no friction at the centre plate. Casanueva et al <ref type="bibr" target="#b68">[68]</ref> extend the wear prediction methodology for freight wagons to also include switches and crossings. It is concluded that wear on some parts of the wheel profile can only be explained with running through switches.</p><p>Tunna and Urban <ref type="bibr" target="#b69">[69]</ref> carried out a parametric study to quantify the effects of various freight vehicle parameters on the generation of RCF. Three different freight suspensions were considered: an enhanced three-piece bogie, a rigid-frame bogie with primary suspension, and a two-axle vehicle with leaf springs. Simulations were performed for track curvature ranging from 400 to 10 000 m. 
To judge the generation of RCF the Tgamma model from Burstow <ref type="bibr" target="#b70">[70]</ref> was used. It is stated that parameters that clearly need to be considered when evaluating rail surface damage are curve distribution, track quality, conicity, vehicle type and loading state of the wagon. Since several parameters are line dependent it is concluded that a route based analysis is necessary.</p><p>In <ref type="bibr" target="#b71">[71]</ref> a simulation model of an iron ore wagon with three-piece bogie is developed to investigate the risk of RCF on the Swedish and Norwegian iron ore line. 43 load cases with various conditions were used as inputs. The risk for RCF was estimated with the so-called shakedown map. The wear number, which is the product of creepages and creep forces, was calculated to estimate where initiated cracks develop or are worn away. In <ref type="figure" target="#fig_1">figure 23</ref> areas on the wheel profile with high risk of RCF can be seen. The area on the wheel tread coincides very well with field observations of RCF but the areas in the flange root and on the flange did not show RCF damage. It can be concluded that the energy dissipation is high enough to wear away initiated cracks. It seems that simulation of the curving behaviour of freight wagons can provide valuable information about the risk of wheel damage for specific operating conditions.</p><p>In <ref type="bibr" target="#b71">[71]</ref> a simulation by Dukkipati and Dong examine the effects of a freight wagon running over a dipped joint. In a very recent paper Wang and Gao investigate the wheel wear of a freight vehicle with three-piece bogie in curves <ref type="bibr" target="#b99">[99]</ref>. It is shown that wear is most severe on the outer leading wheel in the bogie. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4">Parameter identification</head><p>The establishment of the correct parameters for use in computer models is clearly of great importance. Some parameters can easily be measured or provided by the manufacturers but others are very difficult to establish. Ren et al <ref type="bibr" target="#b74">[74]</ref> demonstrate the use of a test rig with a sliding plate underneath one wheelset to establish key parameters. The sliding plate is moved with actuators and the forces are measured to allow the lateral, shear and warp stiffness to be established as well as the friction characteristics of the bogie.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4">Modern Developments</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1">The British Rail HSF Bogies</head><p>Wickens and colleagues at British Rail Research carried out theoretical and practical work aimed at understanding the dynamic performance of two-axle freight vehicles <ref type="bibr" target="#b75">[75]</ref>, <ref type="bibr" target="#b76">[76]</ref>. The aim was to increase the operating speed of freight vehicles and reduce the rate of derailments. A series of experimental two-axle vehicles were constructed to confirm the results of the analysis. They included coil springs and viscous dampers and longitudinal rods to control yaw motion and were initially tested on a full size roller rig.</p><p>Computer simulations of curving and stability were carried out with various damper configurations and on-track tests of several prototypes were undertaken.</p><p>The result of this work was the prototype 'HSFV.4' high speed freight vehicle with viscous damping (<ref type="figure" target="#fig_1">figure 24</ref>) which was tested at speeds of up to 120 km/h and proved to run without hunting for a wide range of effective conicity values. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2">The Unitruck running gear</head><p>The Unitruck single-axle running gear with lateral "swing hangers" was first developed for the American market and in the 1990's adjusted to suit European conditions. Vehicles with Unitruck running gear <ref type="bibr" target="#b76">[76]</ref> are today used both in North America and Europe. They have only one stage suspension, which also includes friction damping. As in the Y25 bogie, the vertical force in the primary suspension is used to preload the different friction components via an inclined surface. <ref type="figure" target="#fig_1">Figure 25</ref> left shows the wedge element, which is in series with one of the coil springs and in contact with the carbody via an inclined friction surface; the vertical surface in contact with the saddle is also a friction surface. Newer designs have substituted the inclined friction surface by a roller (figure 25 left) <ref type="bibr" target="#b77">[77]</ref>, thus enabling the displacement in the longitudinal direction, but reducing longitudinal damping. Also, adding a coupling plate in the centre of the coil springs increases longitudinal stiffness <ref type="figure" target="#fig_1">(Figure 25</ref> right), which improves critical speed compared to the running gear with rollers and classic coil springs. The 'Swing Motion' bogie ( <ref type="figure" target="#fig_1">figure 26</ref>) is a variant of the three-piece freight bogie and was originally developed for heavy haul operations in North America. In the Swing Motion design an additional cross member or transom is included which connects the two side frames together via pivots at the base of the secondary spring pack. The bolster still sits on the top of the spring packs and is damped through friction wedges. A pivot between the axle boxes and the side frames is also included so that the side frames can pivot or swing to accommodate lateral motion of the bolster. 
The swing motion gives increased lateral stability at speeds up to 176 km/h and is claimed to reduce wheel and rail wear, reduce rolling resistance and forces on track and vehicle body compared with standard three-piece bogies.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4">The 'LTF' bogie</head><p>In the 1980s British Rail Research in the UK developed a novel, track-friendly bogie using passenger vehicle technology. The LTF25 bogie is shown in <ref type="figure" target="#fig_1">figure 27</ref> and is described in <ref type="bibr" target="#b79">[79]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 27: The 'LTF25' bogie</head><p>The LTF25 bogie was specifically designed to reduce dynamic track forces and, as part of this, an effort was made to reduce the unsprung mass. Small wheels (813 mm diameter) and inside axle boxes were used, giving a 30% reduction in wheelset mass, although this necessitated the use of on-board hotbox detectors.</p><p>Primary suspension is through steel coil springs and secondary suspension is through rubber spring elements and hydraulic dampers.</p><p>The high cost of the LTF25 bogie and concerns about axle fatigue with inboard axle boxes militated against its adoption but Powell Duffryn produced a modified version of the bogie known as the TF25 bogie (shown in <ref type="figure" target="#fig_1">figure 28</ref>) which has achieved considerable production success. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.5">The 'Gigabox' bogie</head><p>The 'Gigabox' bogie uses pedestal units containing progressive rubber springs with integral hydraulic damping (as shown in figures 29 and 30). The system was developed by ContiTec and SKF and is claimed not to require maintenance for up to 1 million km and to provide good noise and vibration isolation. A reduction of up to 20% in lateral forces is claimed as well as a 2 dB reduction in noise. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.6">The Double Rubber Ring Spring (DRRS) bogie</head><p>Originally designed by Talbot, the DRRS bogie uses double rubber toroidal ring springs with load-proportional friction damping as shown in <ref type="figure" target="#fig_0">figure 31</ref>. Container wagons with DRRS bogies entered service with the DB 'Inter Cargo Express-System'. Maximum axle-load ranges from 22.5 t at 100 km/h to 18.375 t at 160 km/h. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.7">Advances in three-piece bogies</head><p>The major drivers for advances of AAR three-piece bogies were tightening ride performance and track impact standards, such as M-1001 <ref type="bibr" target="#b79">[79]</ref> and M-976 <ref type="bibr" target="#b80">[80]</ref>, since 2000.</p><p>An overview of improvements in the suspensions is given in <ref type="bibr" target="#b82">[81]</ref>. Suspension springs tend to increase the deflection. Using higher control springs under the wedges increases friction under the empty wagon thus providing its better stability, and makes damping less dependent on the wear of wedges themselves. Different height of the inner and outer springs allows having lower lateral stiffness of the suspension under the empty wagon, thus improving its running performance. Using the set of 9 double springs per each side of the bogie increases warping resistance.</p><p>The innovative designs of the wedges are shown in <ref type="figure" target="#fig_1">figure 32</ref>. Both designs aim to increasing the warping resistance of the bogie. The split wedge consists of two symmetric parts inclined towards each other and interacts with the spatial insert in the bolster pocket. In the spatial wedge the surfaces are inclined in the other direction and they are wider than the vertical surface, which gives the same effect. In the interaction between the side frame and the wheelset axle various elastic components are introduced to reduce unsprung mass as well as to reduce resistance to wheelset displacement in plane, thus reducing the lateral track forces. Some of the designs of elastic shear pads are shown in <ref type="figure">figure 33</ref>. The rigid side bearings with clearances have transformed in modern three-piece bogies into constant contact side bearings, incorporating the elastic element compressed by the weight of the car body, <ref type="bibr" target="#b83">[82]</ref>. 
Examples of the design are shown in <ref type="figure" target="#fig_3">figure 34</ref>. Constant contact side bearings provide yaw damping for the bogies on straight track, as well as additional car body roll resistance for better curving performance. The rollers positioned with a clearance provide rigid bumpstop that limits the elastic element deflection without increasing the yaw resistance. There are several devices used to increase warping stiffness of three-piece bogies, the most common of which is using cross-braces between the side frames shown in <ref type="figure" target="#fig_4">figure 35</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Cap</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Elastic element</head><p>Cage Wear resistant element</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Insert</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Roller</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Cap</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Elastic element</head><p>Cage 1 - top brace; 2 - bottom brace; 3 - bolt; 4 - washer; 5 - nut; 6 - fastening unit; 7 - rings; 8 - locking plate; 9 - washer; 10 - bolt; 11 - elastic pad; 12 - safety wire; 13, 14 - bracket; 15, 16, 17 - plate; 18 - key <ref type="figure" target="#fig_4">Figure 35</ref> Cross-braces between side frames.</p><p>Using the concept of shear and bending stiffness of the bogie Scheffel <ref type="bibr" target="#b84">[83]</ref>, developed several novel designs of three-piece bogies (figure 36). At first the horizontal motion of the frame is decoupled from the wheelsets by horizontally soft primary suspension. Then the axle boxes are interconnected through sub-frames or arms by elastic elements that support their radial position in curves, but resist in-phase yaw <ref type="bibr" target="#b85">[84]</ref>. Scheffel bogies having the axle load of 32 t provide mileage between wheel turning of up to 1.5 million kilometres thus proving the high efficiency of the design to reduce track forces.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.8">The Lenoir pusher spring</head><p>Various alternatives to the double Lenoir linkage have been explored with the aim of providing reduced longitudinal stiffness at low cost. One example is the 'Lenoir pusher spring' which consists of a plunger and washer springs mounted opposite the Lenoir pusher (<ref type="figure" target="#fig_32">figure 37</ref>). This allows more longitudinal motion than the conventional arrangement. Piotrowski <ref type="bibr" target="#b87">[86]</ref> reports how this arrangement has been shown to give good performance in a prototype vehicle with significant reductions in wheel wear.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.9">The RC25NT Bogie</head><p>Eisenbahn Laufwerke Halle (Germany) has developed the RC25NT self-steering three-piece bogie with direct inter-axle linkages which was presented at the Innotrans exhibition in 2010 <ref type="bibr">[87]</ref> (<ref type="figure" target="#fig_2">figure 38</ref>). The bogie has horizontally soft rubber bushes in the primary suspension and flexicoil dual rate springs with friction damping in the secondary suspension. The bogie is equipped with disk brakes. The aim of the development was to build a bogie capable of stable running up to 120 km/h, keeping low noise criteria and negotiating curves with a minimum of wear. The bogie is designed to replace the Y25 type bogie without changes to the wagon body.</p><p>Simulations have shown that the RC25NT provides better stability on straight track than the Y25 (figure 39) and less wheel and rail wear in curves (<ref type="figure" target="#fig_3">figure 40</ref>). The bogie was tested according to the UIC 518 standard in Sweden in 2010 for speeds up to 160 km/h. The RC25NT demonstrates that direct inter-axle linkages can allow freight car bogies to run at 120 km/h with proper steering and low wear in curves. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.10">The 'LEILA' Bogie</head><p>The LEILA bogie ('LEIchtes und LärmArmes GüterwagenDrehGestell' with the meaning of light and low noise freight bogie) is a passive radial steering bogie with a maximum axle load of 22.5 t and was developed between 2000 and 2005 during a German and Swiss research project <ref type="bibr" target="#b88">[88]</ref>. The Institute of Rail Vehicles of the Technische Universität Berlin was one of the involved partner. The aim to develop this bogie was:</p><p> to reduce the noise emissions of freight wagons;  to reduce the mass of a bogie to be under 4 t and  to reduce significantly wear and running resistance.</p><p>In addition:</p><p> the reliability and availability of freight wagons;  transparency in the transport chain;  the active and passive safety of the freight traffic and;  the transport velocity should be similarly increased <ref type="bibr" target="#b89">[89]</ref>. The primary layer consists of rubber springs and the load dependent stiffness characteristics are separated in vertical and horizontal working components. The bogie has passive radial steering technology of the wheelsets. Wheelsets are able to rotate about the vertical axis without any external energy but only by the roll radius difference between the inner and outer wheel. Both wheelsets are connected with cross anchors; mounted on opposite axle boxes. The secondary layer is defined UIC centre of pivot and side bearer (latter guarantees the exchangeability to Y25 bogies). In addition, the centre of pivot has an elastically bearing using a secondary rubber spring. The LEILA bogie prototype was examined during various field tests where it demonstrated its advantages compared to a Y25 bogie. The noise emissions were reduced up to 18 dB(A) compared to a Y25 bogie with cast iron brake blocks and up to 8 dB(A) compared to a Y25 bogie with composite blocks (k- blocks). 
But the bogie failed at that time to enter the market. During the very good ongoing homologation process the producer of the bogie decided to stop the production of new freight wagons and bogies. Therefore the homologation was stopped and not finished just for commercial reasons. Right now as more and more EMUs are produced with inner bearings it is expected that the acceptability of inner bearing bogies with the advantages less weight and lower forces at the axles in curves will be more acceptable. As with the Leila bogie the cross anchor couples the two axles so that they turn with a phase shift of 180°. This stabilizes the radial steering effect even when the wheel-rail contact is not perfect and the second very important effect is dynamic stabilisation without yaw dampers for high speed straight track running. On curvy track significant flange and running surface wear reduction and also significant reduction of the running resistance occur.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.12">The SUSTRAIL Bogie</head><p>The aim of the SUSTRAIL project is to promote modal shift of freight in Europe from road to rail. The SUSTRAIL project intends to provide the approach, structure, and technical content to support this modal shift through improvements in the railway freight system including innovations in rolling stock in track components. The project includes workpackages focused on market research, vehicles, infrastructure and assessment of cost benefits. The work described here is part of workpackage 3: 'The freight vehicle of the future'.</p><p>The main scientific and technological innovations being considered for the SUSTRAIL freight vehicle are:</p><p> The development of advanced vehicle dynamics concepts based on new wheel profiles and improvements in suspension design responding to the needs of a mixed traffic railway;  Developments in the traction and braking systems for high speed low impact freight operation;  Novel designs and materials for lightweight high performance freight wagon body vehicles and bogie structures;  Advanced condition based predictive maintenance tools for critical components of both railway vehicles and the track;  Identification of performance based design principles to move towards the zero maintenance ideal for the vehicle/track system.</p><p>Partners in the project have carried out a technology review to identify the potential innovative technologies to meet the above requirements and the results have been ranked and two concept vehicles are being designed. The 'Conventional' vehicle will use optimised existing technology and a demonstrator for this is being built as part of the project. 
The 'Futuristic' vehicle will utilise technology which has not yet been proven in the railway field but has potential to make greater improvements.</p><p>Simulations have been carried out of the dynamic behaviour of the concept design vehicles running on typical track in tare, part laden and fully laden cases. In line with the target of a 50% reduction in lateral forces on the track and stable running at 140 km/h a suspension using double Lenoir linkages, longitudinal linkages between axle boxes and centre pivot suspension has been selected. Computer simulation has been used to optimise the suspension and to select suitable parameters for the various components. Assessment of the results is based on:</p><p> Stability: stable running on typical European track at the design speed of 140km/h must be ensured and ride quality (vertical lateral and longitudinal accelerations experienced by the goods transported) will be assessed.  Reduced track forces: track geometrical deterioration (ballast settlement and horizontal level, alignment and buckling), rail surface damage (wear, rolling contact fatigue -RCF) and track components damage (sleeper cracking, rail pad deterioration, rail fatigue, fastening deterioration) will all be assessed.</p><p>A benchmark vehicle has been selected based on a Y25 bogie and flat bed wagon and has been used to allow quantification of the benefits of the new design.</p><p>A number of radical innovations were considered during the technology review stage of the project but it was decided that the use of double Lenoir link primary suspension as in the Y37 series of bogies (figure 44), would be investigated. The double Lenoir link suspension provides much lower longitudinal primary stiffness while still utilising standard components and methods which are well established within the railway industry. 
this work A model of the SUSTRAIL vehicle was set up with double Lenoir links using the computer simulation tool Gensys and the influence of variations in the suspension parameters on the critical speed of the wagon was simulated. Straight track was used for this simulation and an initial lateral disturbance was introduced followed by ideal track with no irregularities. Axle load is 22.5 t, wheel profile is S1002 and rail profile UIC60 inclined at 1:40. The wheel rail coefficient of friction is set at 0.35. The wagon speed was reduced from an initial 170 km/h and critical speed assumed to have been reached when the track shifting force (∑ ) drops below 2.5 kN. An example is shown in <ref type="figure" target="#fig_3">figure 46</ref>. Further variations were carried out and the effect of the friction coefficient and stiffness within the suspension on the maximum contact force is shown in <ref type="figure" target="#fig_3">figure 49</ref>. <ref type="figure" target="#fig_3">Figure 49</ref>: The effect of friction coefficient and spring stiffness on the contact force It can be seen that the maximum vertical contact forces tends to increase with the damping and with the spring stiffness. In order to improve the running behavior of the SUSTRAIL vehicle it was decided to assess the benefit of linkages provividing longitudinal stiffness between the axleboxes using a radial arm. A radial arm designed by <ref type="bibr">Scheffel [90]</ref> was studied previously in the Infra-Radial project <ref type="bibr" target="#b91">[91]</ref> which aimed to develop a bogie for heavy haul vehicles (axle loads over 25T) with reduced life cycle costs. Tests using the radial arm with four different primary suspension types showed good results with stable running and radially aligned wheelsets in curves. Wear of the wheels was seen to reduce significantly <ref type="bibr" target="#b91">[91]</ref>. 
In the work reported here simulation was carried out using MEDYNA for the SUSTRAIL vehicle with double Lenoir links and modified radial arms. Simulations have confirmed that the radial arm should provide lateral stiffness between the wheelsets and optimised parameters have been defined. A prototype of the SUSTRAIL freight vehicle is being constructed by REMARUL engineering. In addition to Vertical coils spring stiffness <ref type="bibr">[%]</ref> the innovative suspension described in this paper the SUSTRAIL vehicle will have disk brakes with an electronic control system. The bogie design is shown in <ref type="figure" target="#fig_4">figure 50</ref>. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5">Longitudinal dynamics</head><p>The longitudinal dynamic behaviour of railway vehicles is often neglected as the link to the vehicle track interaction is generally not significant and it has been common to assume that all vehicles of the same type in a train will behave identically. In heavy haul freight applications however where long trains are common the effect of longitudinal dynamics can become significant. In <ref type="bibr" target="#b71">[71]</ref> for example Qi et al model the longitudinal behaviour of a long train including traction and braking and the coupling between vehicles. Belforte et al <ref type="bibr" target="#b93">[93]</ref> also analyse the effects of severe traction and braking forces on longitudinal dynamics.</p><p>There are several areas where longitudinal dynamics can interact with the general vehicle dynamics. These include:</p><p>• Wheel unloading on curves due to lateral components of coupler forces; • Wagon body pitch due to coupler impact forces and • Bogie pitch due to coupler impact forces Cole <ref type="bibr" target="#b94">[94]</ref> describes how these effects can be assessed in different cases and McClanachan <ref type="bibr" target="#b95">[95]</ref> and El Sibaie <ref type="bibr" target="#b96">[96]</ref> present results of computer simulations including coupler models.</p><p>Freight vehicles have to provide satisfactory performance at low cost in tare and laden condition on varying track quality. This has resulted in several standard designs including the Y25 and the three-piece bogie. These designs use friction damping proportional to the vehicle mass to provide good dynamic performance at all loading conditions. In recent years vehicle designers have tried to improve on the dynamic performance of freight wagons and the use of computer tools has helped to overcome the compromise between good curving performance and stability at higher speeds. 
This has resulted in a number of innovative designs with demonstrable performance improvements but it is notable that few of these have yet to make significant impact in the worldwide freight train fleets.</p><p>A key reason for this lack of adoption is probably the innately conservative nature of the railway industry. Of course this often has a sound basis in, for example, the benefit of using standard components which allow effective maintenance of widely dispersed fleets of vehicles but in order to allow the benefits of the innovative techniques and designs summarised in this paper it is time to reconsider the design of freight vehicles. This could allow increases in speed with lower impact on track and environment and a resulting step change in performance of the railway system. One encouraging sign is the establishment in some countries of track access charging which benefits the use of vehicles with 'track friendly' suspension. Together with emerging legislation and growing pressures on system capacity it is likely that the demand for freight vehicles with higher dynamic performance will climb rapidly.</p><p>Rail freight can only contribute to mitigating the environmental impacts of transportation if the knowledge and today's experience of innovative products are used. Some basic thoughts can be found here and in <ref type="bibr" target="#b97">[97]</ref>. Optimising performance through the development of innovative products is to be planned and procured carefully. This paper has demonstrated that freight vehicle designers have innovative designs of running gear and computer simulation tools ready for this challenge.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Freight wagon from Kockums Sweden, built in 1882 [4].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: UIC double link suspension.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 : Double link suspension [ 8 ]. Parts of double link (a), assembled double link (b) and mounted double link (c).</head><label>38</label><figDesc>Figure 3: Double link suspension [8]. Parts of double link (a), assembled double link (b) and mounted double link (c).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 4 :</head><label>4</label><figDesc>Figure 4: DB bogie Type 931 [7].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 5 :</head><label>5</label><figDesc>Figure 5: DB bogie Type 665 [7].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 7 : Model 18- 100 bogie: a - general view, b - central suspension scheme, c - primary 'suspension' scheme ( 1 - wheelset; 2 - side frame; 3 - bolster; 4 - braking leverage; 5 - central pivot; 6 - rigid side bearings; 7 - suspension springs; 8 - friction wedge; 9 -</head><label>7100123456789</label><figDesc>Figure 7: Model 18-100 bogie: a-general view, b-central suspension scheme, c-primary 'suspension' scheme (1-wheelset; 2-side frame; 3-bolster; 4-braking leverage; 5-central pivot; 6-rigid side bearings; 7-suspension springs; 8-friction wedge; 9-axle-box)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_7"><head>Figure 8 : Force-displacement curve of Coulomb friction model (left) and Coulomb model with spring in series as in [ 29 ]</head><label>829</label><figDesc>Figure 8: Force-displacement curve of Coulomb friction model (left) and Coulomb model with spring in series as in [29] (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8"><head>Figure 9 : Friction element with spring in series.</head><label>9</label><figDesc>Figure 9: Friction element with spring in series.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_10"><head>Figure 10 : Typical force-displacement diagram of leaf spring/link suspension. Example of curve for small displacements around static equilibrium.</head><label>10</label><figDesc>Figure 10: Typical force-displacement diagram of leaf spring/link suspension. Example of curve for small displacements around static equilibrium.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_11"><head>Figure 11 : Model for leaf spring or link suspension as used for example by KTH [ 40 ]. See figure 10 for definition of k1 and k2.</head><label>114010</label><figDesc>Figure 11: Model for leaf spring or link suspension as used for example by KTH [40]. See figure 10 for definition of k1 and k2.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_12"><head>Figure 12 Model for bumpstop element (∆ - clearance, - stiffness of the bumpstop)</head><label>12</label><figDesc>Figure 12 Model for bumpstop element (∆-clearance,-stiffness of the bumpstop)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_13"><head>Figure 13 Model for center plate element (∆ - distance between center plate edge and car body center of gravity, - roll angle, - weight of the car body per one center plate, - roll torque, - equivalent roll stiffness)</head><label>13</label><figDesc>Figure 13 Model for center plate element (∆-distance between center plate edge and car body center of gravity,-roll angle,-weight of the car body per one center plate,-roll torque,-equivalent roll stiffness)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_14"><head>Figure 15 : Procedure to find the non-linear critical speed [ 60 ].</head><label>1560</label><figDesc>Figure 15: Procedure to find the non-linear critical speed [60].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_15"><head>Figure 16 : Bifurcation diagram for a loaded two-axle vehicle with link suspension ( 21 t axle load) Wheel: somewhat worn S1002. Rail: Nominal UIC60 [ 42 ].</head><label>162142</label><figDesc>Figure 16: Bifurcation diagram for a loaded two-axle vehicle with link suspension (21 t axle load) Wheel: somewhat worn S1002. Rail: Nominal UIC60 [42].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_16"><head>Figure 17 .</head><label>17</label><figDesc>shows attractors for two different types of freight wagons. The results are in principle quite similar to those in figure 16.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_17"><head>Figure 17 : Attractors for the Hbbills 311 and the G69 freight wagons. The model with the measured characteristics of the UIC links is damping less than the model with the cylindrical characteristics. The hunting attractor exists even for low speeds [61].</head><label>17</label><figDesc>Figure 17: Attractors for the Hbbills 311 and the G69 freight wagons. The model with the measured characteristics of the UIC links is damping less than the model with the cylindrical characteristics. The hunting attractor exists even for low speeds [61].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_18"><head>Figure 18 Map of lateral oscillation amplitude in single wagon as function of curve radius [62].</head><label>18</label><figDesc>Figure 18 Map of lateral oscillation amplitude in single wagon as function of curve radius [62].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_19"><head>Figure 19 : Energy dissipation. Comparative simulation with and without track irregularities. Two-axle vehicle with link suspension. 22 . 5 t axle load [ 42 ].</head><label>1922542</label><figDesc>Figure 19: Energy dissipation. Comparative simulation with and without track irregularities. Two-axle vehicle with link suspension. 22.5 t axle load [42].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_20"><head>Figure 20 : Angle of attack (a) and wear number (b) for wagons in a curve of 200 m radius at 60 km/h with 18- 100 bogies respectively bogies with radial arm upgrade [ 15 ].</head><label>2010015</label><figDesc>Figure 20: Angle of attack (a) and wear number (b) for wagons in a curve of 200 m radius at 60 km/h with 18-100 bogies respectively bogies with radial arm upgrade [15].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_21"><head>Figure 21 : Y25 bogie running in a 300m curve Wheel slip lateral and longitudinal at all wheel rail contact points, 90 t tank car with a Y25-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0, 67 m/s², s1002 Wheel profile, UIC 60E1, 1 Figure 22 : Radially steered bogie running in a 300 m curve Wheel slip lateral and longitudinal at all wheel- rail contact points, 90 t tank car with a Leila-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0, 67 m/s², s1002 Wheel profile, UIC 60E1, 1</head><label>21903006712230090300671</label><figDesc>Figure 21: Y25 bogie running in a 300m curve Wheel slip lateral and longitudinal at all wheel rail contact points, 90 t tank car with a Y25-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0,67 m/s², s1002 Wheel profile, UIC 60E1, 1:40 rail inclination</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_22"><head>Figure 23 : Calculated RCF positions of the wheel with corresponding average wear number. The far-left line is also reported as the observed approximate location for RCF initiation.</head><label>23</label><figDesc>Figure 23: Calculated RCF positions of the wheel with corresponding average wear number. The far-left line is also reported as the observed approximate location for RCF initiation.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_23"><head>Figure 24 :</head><label>24</label><figDesc>Figure 24: The HSFV.1 experimental freight wagon</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_24"><head>Figure 25 :</head><label>25</label><figDesc>Figure 25: Unitruck running gear (left) and modifications for improving curving behaviour (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_25"><head>Figure 28 :</head><label>28</label><figDesc>Figure 28: The TF25 bogie</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_26"><head>Figure 29 : The Gigabox bogie Figure 30 :</head><label>2930</label><figDesc>Figure 29: The Gigabox bogie</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_27"><head>Figure 31 : The DRRS bogie and cross section</head><label>31</label><figDesc>Figure 31: The DRRS bogie and cross section</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_28"><head>Figure 32 :</head><label>32</label><figDesc>Figure 32: Split wedge (left) and spatial wedge (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_30"><head>Figure 33 : Adapter Plus ® by Amsted (left) and layered shear pad in Russian 18- 9800 bogie (right).</head><label>339800</label><figDesc>Figure 33: Adapter Plus ® by Amsted (left) and layered shear pad in Russian 18-9800 bogie (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_31"><head>Figure 34 : Constant contact side bearing with springs (left) and with non-metal element and roller (right).</head><label>34</label><figDesc>Figure 34: Constant contact side bearing with springs (left) and with non-metal element and roller (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_32"><head>Figure 37 :</head><label>37</label><figDesc>Figure 37: The Lenoir pusher spring</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_33"><head>Figure 38 : RC25NT bogie with direct inter-axle linkages Figure 39 : Simulation stability results for RC25NT bogie vs. Y25 bogie (upper figure = high conicity, lower figure = low conicity)Figure 40 : Simulated wear number for RC25NT bogie vs. Y25 bogie</head><label>383940</label><figDesc>Figure 38: RC25NT bogie with direct inter-axle linkages</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_34"><head>Figure 41</head><label>41</label><figDesc>Figure 41 and 42 show the main components of this bogie. Compared to the standard bogies such as Y25, the LEILA bogie has inner bearings. The resulting better force flow lead to a weight reduction of the bogie frame and wheelset resulting in an overall weight reduction of 750 kg per bogie compared to Y25 bogie. At the web of the wheels (diameter: 920 mm), disc brakes are mounted. The primary layer consists of rubber springs and the load dependent stiffness characteristics are separated in vertical and horizontal working components. The bogie has passive radial steering technology of the wheelsets. Wheelsets are able to rotate about the vertical axis without any external energy but only by the roll radius difference between the inner and outer wheel. Both wheelsets are connected with cross anchors; mounted on opposite axle boxes. The secondary layer is defined UIC centre of pivot and side bearer (latter guarantees the exchangeability to Y25 bogies). In addition, the centre of pivot has an elastically bearing using a secondary rubber spring. The LEILA bogie prototype was examined during various field tests where it demonstrated its advantages compared to a Y25 bogie. The noise emissions were reduced up to 18 dB(A) compared to a Y25 bogie with cast iron brake blocks and up to 8 dB(A) compared to a Y25 bogie with composite blocks (kblocks). But the bogie failed at that time to enter the market. During the very good ongoing homologation process the producer of the bogie decided to stop the production of new freight wagons and bogies. Therefore the homologation was stopped and not finished just for commercial reasons. Right now as more and more EMUs are produced with inner bearings it is expected that the acceptability of inner bearing bogies with the advantages less weight and lower forces at the axles in curves will be more acceptable.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_35"><head>Figure 41 : Main components of LEILA bogie [ 88 ] Figure 42 : Leila Bogie from beneath with the inner bearings, cross anchor and wheel disc brakes clearly visible 4 .</head><label>4188424</label><figDesc>Figure 41: Main components of LEILA bogie [88]</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_36"><head>Figure 43 : TVP2007 bogie by Tatravagónka a.s.</head><label>43</label><figDesc>Figure 43: TVP2007 bogie by Tatravagónka a.s.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_37"><head>Figure 44 :</head><label>44</label><figDesc>Figure 44: A suspension with double Lenoir links</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_38"><head>Figure 46 : A sample simulation results showing the establishment of the critical speed for the SUSTRAIL vehicle with double Lenoir links</head><label>46</label><figDesc>Figure 46: A sample simulation results showing the establishment of the critical speed for the SUSTRAIL vehicle with double Lenoir links</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_39"><head>Figure 47 :</head><label>47</label><figDesc>Figure 47: The effect of Lenoir link angle, length and friction coefficient on the critical speed of the SUSTRAIL vehicle</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_40"><head>Figure 48 : Maximum vertical force on the rail for the SUSTRAIL vehicle running at 120 km/h</head><label>48</label><figDesc>Figure 48: Maximum vertical force on the rail for the SUSTRAIL vehicle running at 120 km/h</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_41"><head>Figure 50 :</head><label>50</label><figDesc>Figure 50: The prototype SUSTRAIL freight bogie</figDesc></figure>
+
+ <note place="foot" n="1">-side frame; 2-bolster; 3-wheelset; 4-primary suspension; 5-elastic connection between sub-frames Figure 36: Scheffel HS bogie (left) and bogie retrofitted with Radial Arm design (right).</note>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">European freight vehicle running gear: today&apos;s position and future demands</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">PartF, Journal of Rail and Rapid Transit</title>
+ <imprint>
+ <biblScope unit="volume">215</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+ <note>Proc. Of the Inst. Of Mech. Engrs.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">White Paper-Roadmap to a Single European Transport Area-Towards a competitive and resource efficient transport system</title>
+ </analytic>
+ <monogr>
+ <title level="j">European Commission</title>
+ <imprint>
+ <biblScope unit="volume">144</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>COM</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">Freight Wagon Running Gear-a review, KTH Railway Division</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2002" />
+ <pubPlace>Stockholm</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <monogr>
+ <title level="m">Swedish: Järnvägsfordon från Kockums), Kockums industrier</title>
+ <meeting><address><addrLine>Malmö, Sweden, Pamphlet</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1995" />
+ </imprint>
+ </monogr>
+ <note>Railway vehicles from Kockums</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m">UIC Code 517. Wagons-Suspension gear (Standardisation)</title>
+ <imprint/>
+ </monogr>
+ <note>6th edition 1-7-79. Reprint 1-1-89. incorporating 8 amendments</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <monogr>
+ <title level="m" type="main">Laufwerkskonstruktion und Erhöhung der Radsatzlasten im Güterverkehr. ZEV-Glasers Annalen 107</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">T</forename><surname>Madeyski</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1983" />
+ <biblScope unit="page" from="139" to="147" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><surname>Müller</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Niedermeyer</surname></persName>
+ </author>
+ <title level="m">Weiterentwickelte Güterwagendrehgestelle der Deutschen Bundesbahn für 22.5 t Radsatzlast-wieder nach dem Lenkachsenprinzip. ZEV-Glasers Annalen</title>
+ <imprint>
+ <date type="published" when="1987" />
+ <biblScope unit="volume">111</biblScope>
+ <biblScope unit="page" from="188" to="196" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Dynamic analysis of a freight car with standard UIC single-axle running gear</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Lange</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH Railway Technology</title>
+ <imprint>
+ <biblScope unit="page">34</biblScope>
+ <date type="published" when="1996" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <monogr>
+ <title level="m" type="main">Running behavior of railway freight wagon with single axle running gear</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <idno>1998:40</idno>
+ <imprint>
+ <date type="published" when="1998" />
+ <publisher>KTH</publisher>
+ </imprint>
+ <respStmt>
+ <orgName>Division of Railway Technology</orgName>
+ </respStmt>
+ </monogr>
+<note type="report_type">TRITA-FKT Report</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">How to improve the running behavior of freight wagons with UIC-link suspension. Vehicle System Dynamics Supplement 33</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ <biblScope unit="page" from="394" to="405" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <monogr>
+ <title level="m" type="main">Running behavior of freight wagons with link bogies. TRITA-FKT Report 1999:12, Division of Railway Technology</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ <publisher>KTH</publisher>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Neue Erkenntnisse über das Verschleissverhalten von Güterwagendrehgestellen, ZEV Glasers Annalen 111</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Specht</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1987" />
+ <biblScope unit="page" from="271" to="280" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section D. Trucks and truck details</title>
+ <imprint>
+ <date type="published" when="2010" />
+ <biblScope unit="volume">130</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <analytic>
+ <title level="a" type="main">Bogies two-axle three-piece for freight wagons of 1520 mm gauge railways. General technical specifications</title>
+ </analytic>
+ <monogr>
+ <title level="m">GOST 9246-2013</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Refining the wedge friction damper of three-piece freight bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Romen</forename><surname>Yu</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="445" to="455" />
+ <date type="published" when="2008" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Influence of construction schemes and parameters of three-piece freight bogies on wagon stability, ride and curving qualities</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><surname>Boronenko</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Rudakova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="page" from="402" to="414" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+ <note>Supplement</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Identification of parameters for spatial wedge system implemented in freight bogie design</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 10th mini-conference on Vehicle System Dynamics, Identification and Anomalies. Ed. I. Zobory. ISBN 978 963 420 968</title>
+ <meeting>the 10th mini-conference on Vehicle System Dynamics, Identification and Anomalies. Ed. I. Zobory. ISBN 978 963 420 968<address><addrLine>Budapest</addrLine></address></meeting>
+ <imprint>
+ <publisher>Komaromi Nyomda es Kiado Kft</publisher>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="245" to="252" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">A review of modeling methods for railway vehicle suspension components. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><forename type="middle">M</forename><surname>Eickhoff</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">R</forename><surname>Evans</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">J</forename><surname>Minnis</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">24</biblScope>
+ <biblScope unit="page" from="469" to="496" />
+ <date type="published" when="1995" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Modelling of suspension components in a rail vehicle dynamics context</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><surname>Bruni S</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Vinolas</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Stichel</forename><forename type="middle">S</forename></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">49</biblScope>
+ <biblScope unit="issue">7</biblScope>
+ <biblScope unit="page" from="1021" to="1072" />
+ <date type="published" when="2011" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Modeling and Simulation of Freight Wagon with Special attention to the Prediction of Track Damage</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Casanueva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Hossein</forename><surname>Nia</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Railway Technology</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Modelling friction wedges, Part I: The state-of-the-art</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">E</forename><surname>Klauser</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of IMECE04 2004 ASME International Mechanical Engineering Congress &amp; Exposition</title>
+ <meeting>IMECE04 2004 ASME International Mechanical Engineering Congress &amp; Exposition<address><addrLine>Anaheim (CA</addrLine></address></meeting>
+ <imprint>
+ <publisher>American Society of Mechanical Engineering</publisher>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">A review of dynamics modelling of friction wedge suspensions. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Q</forename><surname>Wu</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Spiryagin</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Q</forename><surname>Sun</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="issue">11</biblScope>
+ <biblScope unit="page" from="1389" to="1415" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <monogr>
+ <title level="m" type="main">Non-smooth Problems in Vehicle Systems Dynamics</title>
+ <imprint>
+ <date type="published" when="2010" />
+ <publisher>Springer</publisher>
+ <pubPlace>Berlin Heidelberg</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rail Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Anderson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH Royal Institute of Technology</title>
+ <imprint>
+ <date type="published" when="2013" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b24">
+ <monogr>
+ <title level="m" type="main">Simulation. In: Iwnicki, editor, Handbook of Railway Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Iwnicki</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <publisher>Taylor &amp; Francis</publisher>
+ <biblScope unit="page" from="359" to="421" />
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b25">
+ <analytic>
+ <title level="a" type="main">Modelling of wedge dampers in the presence of two-dimensional dry friction</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Xia</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Vehicle system dynamics</title>
+ <meeting><address><addrLine>Lingby, Denmark</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2003" />
+ <biblScope unit="volume">37</biblScope>
+ <biblScope unit="page" from="565" to="578" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b26">
+ <analytic>
+ <title level="a" type="main">Modeling and Dynamics of Friction Wedge Dampers in Railroad Freight Trucks</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">B</forename><surname>Kaiser</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="55" to="82" />
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b27">
+ <analytic>
+ <title level="a" type="main">Multibody simulation of a freight bogie with friction dampers</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">N</forename><surname>Bosso</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Gugliotta</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Soma</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Railroad Conference</title>
+ <imprint>
+ <publisher>ASME/IEEE Joint</publisher>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="47" to="56" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b28">
+ <analytic>
+ <title level="a" type="main">Model of the UIC link suspension for freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Archive of Applied Mechanics</title>
+ <imprint>
+ <biblScope unit="volume">73</biblScope>
+ <biblScope unit="issue">7</biblScope>
+ <biblScope unit="page" from="517" to="532" />
+ <date type="published" when="2003-12" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b29">
+ <monogr>
+ <title level="m" type="main">Smoothing dry friction damping by dither generated in rolling contact of wheel and rail and its influence on ride dynamics of freight wagons, NVSD</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2010-06" />
+ <biblScope unit="volume">48</biblScope>
+ <biblScope unit="page" from="675" to="703" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b30">
+ <analytic>
+ <title level="a" type="main">Equivalent viscous damping models of coulomb friction in multi-degree-of-freedom vibration systems</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">X</forename><surname>Tan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">J</forename><surname>Rogers</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Sound and Vibration</title>
+ <imprint>
+ <biblScope unit="volume">185</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="33" to="50" />
+ <date type="published" when="1995-08" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b31">
+ <analytic>
+ <title level="a" type="main">Modelling of a two-dimensional Coulomb friction oscillator</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Xia</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Sound and Vibration</title>
+ <imprint>
+ <biblScope unit="volume">265</biblScope>
+ <biblScope unit="issue">5</biblScope>
+ <biblScope unit="page" from="1063" to="1074" />
+ <date type="published" when="2003-08" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b32">
+ <analytic>
+ <title level="a" type="main">A substitute model of two-dimensional dry friction exposed to dither generated by rolling contact of wheel and rail</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">50</biblScope>
+ <biblScope unit="issue">10</biblScope>
+ <biblScope unit="page" from="1495" to="1514" />
+ <date type="published" when="2012" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b33">
+ <monogr>
+ <title level="m" type="main">The dynamics of a railway freight wagon wheelset with dry friction damping Vehicle System Dynamics 44 supplement</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Asmund</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="853" to="861" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b34">
+ <analytic>
+ <title level="a" type="main">Flexibility of trapezoidal springs</title>
+ </analytic>
+ <monogr>
+ <title level="j">ORE</title>
+ <imprint>
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b35">
+ <monogr>
+ <title level="m" type="main">ORE, Parabolic springs for wagons (design, calculation, treatment)</title>
+ <imprint>
+ <date type="published" when="1988" />
+ <pubPlace>Utrecht, 43</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b36">
+ <analytic>
+ <title level="a" type="main">Improvement of the running stability of existing RIV wagons required to run under any loading conditions at speeds of 80 km/h</title>
+ </analytic>
+ <monogr>
+ <title level="j">ORE</title>
+ <imprint>
+ <date type="published" when="1967" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b37">
+ <monogr>
+ <title level="m" type="main">ORE: Etude de la stabilité transversale d&apos;un véhicule ferroviaire à deux essieux</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><forename type="middle">R</forename><surname>Joly</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1974" />
+ <pubPlace>Utrecht</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b38">
+ <monogr>
+ <title level="m" type="main">Computer simulation of freight vehicles with leaf springs</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">B</forename><surname>Ayasse</surname></persName>
+ </author>
+ <idno>INRETS/RE- 01-046-FR</idno>
+ <imprint>
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+<note type="report_type">Technical report</note>
+ <note>a comparison between different packages, INRETS</note>
+</biblStruct>
+
+<biblStruct xml:id="b39">
+ <analytic>
+ <title level="a" type="main">Modelling and laboratory investigations on freight wagon link suspensions with respect to vehicle-track dynamic interaction</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH</title>
+ <imprint>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+<note type="report_type">Licenciate Thesis</note>
+</biblStruct>
+
+<biblStruct xml:id="b40">
+ <analytic>
+ <title level="a" type="main">Experimental and theoretical analysis of freight wagon link suspension</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Andersson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">220</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="361" to="372" />
+ <date type="published" when="2006-01" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b41">
+ <analytic>
+ <title level="a" type="main">Influence of link suspension characteristics variation on two-axle freight wagon dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Andersson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">NVSD</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="415" to="423" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b42">
+ <monogr>
+ <title level="m" type="main">New simulation model for freight wagons with UIC link suspension, VSD</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Persson</forename><forename type="middle">I</forename></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="695" to="704" />
+ </imprint>
+ </monogr>
+ <note>Suppl. 1</note>
+</biblStruct>
+
+<biblStruct xml:id="b43">
+ <monogr>
+ <title level="m" type="main">Dynamics of European two-axle freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hoffmann</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <pubPlace>Kongens Lyngby, Denmark</pubPlace>
+ </imprint>
+ <respStmt>
+ <orgName>Technical University of Denmark</orgName>
+ </respStmt>
+ </monogr>
+<note type="report_type">Ph.D. Thesis</note>
+</biblStruct>
+
+<biblStruct xml:id="b44">
+ <monogr>
+ <title level="m" type="main">Freight wagon running gears with leaf spring and ring suspension, presented at the SIMPACK user group meeting</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Stiepel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Zeipel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b45">
+ <analytic>
+ <title level="a" type="main">Simulation of the Response of Leaf Springs to Broad Band Random Excitation</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Cebon</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">15</biblScope>
+ <biblScope unit="issue">6</biblScope>
+ <biblScope unit="page" from="375" to="390" />
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b46">
+ <monogr>
+ <title level="m" type="main">Measurement and Representation of the Mechanical Properties of Truck Leaf Springs</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">S</forename><surname>Fancher</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Ervin</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><forename type="middle">C</forename><surname>Macadam</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Winkler</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1980-08" />
+ <publisher>SAE International</publisher>
+ <pubPlace>Warrendale, PA</pubPlace>
+ </imprint>
+ </monogr>
+ <note>SAE Technical Paper 800905</note>
+</biblStruct>
+
+<biblStruct xml:id="b47">
+ <analytic>
+ <title level="a" type="main">On Application of the Rolling Contact Theory for Modelling of the UIC Link Suspension for Freight Wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Zeszyty Naukowe Instytutu Pojazdów</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="issue">50</biblScope>
+ <biblScope unit="page" from="5" to="14" />
+ <date type="published" when="2003" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b48">
+ <analytic>
+ <title level="a" type="main">A new mathematical model of the behaviour of a four-axle freight wagon with UIC single-link suspension</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Matei</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1177/0954409711398173</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">225</biblScope>
+ <biblScope unit="page">637</biblScope>
+ <date type="published" when="2011" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b49">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">V</forename><surname>Vershinsky</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">N</forename><surname>Danilov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">N</forename><surname>Chelnokov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><forename type="middle">I</forename></persName>
+ </author>
+ <title level="m">Wagon dynamics. Ðœoscow, Transport</title>
+ <imprint>
+ <date type="published" when="1972" />
+ <biblScope unit="volume">304</biblScope>
+ </imprint>
+ </monogr>
+ <note>in Russian</note>
+</biblStruct>
+
+<biblStruct xml:id="b50">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><surname>Ballew</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><forename type="middle">J</forename><surname>Chan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Sandu</surname></persName>
+ </author>
+ <title level="m">Multibody dynamics modelling of the freight train bogie system Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">49</biblScope>
+ <biblScope unit="page" from="2011" to="501" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b51">
+ <analytic>
+ <title level="a" type="main">Modelling friction wedges, Part II: An improved model. Proceedings of IMECE04</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">E</forename><surname>Klauser</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">ASME International Mechanical Engineering Congress &amp; Exposition</title>
+ <imprint>
+ <date type="published" when="2004-11-13" />
+ <publisher>American Society of Mechanical Engineering</publisher>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b52">
+ <analytic>
+ <title level="a" type="main">Freight car models and their computer-aided dynamic analysis</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Kovalev</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">N</forename><surname>Lysikov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Mikheev</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Pogorelov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><surname>Simonov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><surname>Yazykov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Zakharov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><surname>Zharov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><surname>Goryacheva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Soshenkov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Torskaya</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Multibody System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="399" to="423" />
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b53">
+ <analytic>
+ <title level="a" type="main">On Calculation of Jacobian Matrices in Simulation of Multibody Systems</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Pogorelov</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Preprints of the NATO Advanced Study Institute on Virtual Nonlinear Multibody Systems</title>
+ <editor>Schiehlen and Valasek</editor>
+ <meeting><address><addrLine>Prague</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="159" to="164" />
+ </imprint>
+ <respStmt>
+ <orgName>Czech Technical University in Prague</orgName>
+ </respStmt>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b54">
+ <analytic>
+ <title level="a" type="main">Possibility of jamming and wedging in the three-piece trucks of a moving freight car</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">D</forename><surname>Mckisic</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">F</forename><surname>Ushkalov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Zhechev</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">45</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="61" to="67" />
+ <date type="published" when="2007" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b55">
+ <analytic>
+ <title level="a" type="main">Dynamic models of friction wedge dampers</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 1997 IEEE/ASME Joint Rail Conference</title>
+ <meeting>the 1997 IEEE/ASME Joint Rail Conference<address><addrLine>Boston, MA</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1920-03-18" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b56">
+ <analytic>
+ <title level="a" type="main">Modelling Freight Wagon Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Mcclanachan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><surname>Handoko</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Dhanasekar</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Skerman</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Davey</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics Supplement</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="438" to="447" />
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b57">
+ <monogr>
+ <title level="m" type="main">Modeling and dynamics of friction wedge dampers in railroad freight trucks Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">B</forename><surname>Kaiser</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="2002" to="55" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b58">
+ <analytic>
+ <title level="a" type="main">Chaos in a railway bogie</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><forename type="middle">H</forename><surname>Kaas-Petersen</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Acta Mechanica</title>
+ <imprint>
+ <biblScope unit="volume">61</biblScope>
+ <biblScope unit="page" from="89" to="107" />
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b59">
+ <analytic>
+ <title level="a" type="main">On non-linear methods of bogie stability assessment using computer simulations</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">220</biblScope>
+ <biblScope unit="page" from="13" to="27" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b60">
+ <analytic>
+ <title level="a" type="main">Limit cycle behaviour and chaotic motions of two-axle freight wagons with friction damping</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Multibody System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">8</biblScope>
+ <biblScope unit="issue">3</biblScope>
+ <biblScope unit="page" from="243" to="255" />
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b61">
+ <analytic>
+ <title level="a" type="main">Dynamics of two-axle railway freight wagons with UIC standard suspension, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hoffmann</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110600869594</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="page" from="1" to="139" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b62">
+ <analytic>
+ <title level="a" type="main">Analysis of the nonlinear dynamics of a 2axle freight wagon in curves, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Di</forename><surname>Gialleonardo</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Bruni</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>True</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423114.2013.863363</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="125" to="141" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b63">
+ <analytic>
+ <title level="a" type="main">Lateral hunting stability of railway vehicles running on elastic track structures</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><forename type="middle">M</forename><surname>Zhai</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">K</forename><forename type="middle">Y</forename><surname>Wang</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Computational and Nonlinear Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="41009" to="41010" />
+ <date type="published" when="2010" />
+ </imprint>
+ </monogr>
+ <note>ASME</note>
+</biblStruct>
+
+<biblStruct xml:id="b64">
+ <analytic>
+ <title level="a" type="main">Freight car curving performance in braked conditions</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Berghuvud</surname></persName>
+ </author>
+ <idno type="doi">23.DOI:10.1243/0954409021531656</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers, Part F: Journal of Rail and Rapid Transit</title>
+ <imprint>
+ <biblScope unit="volume">216</biblScope>
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b65">
+ <monogr>
+ <title/>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><surname>Hecht</surname></persName>
+ </author>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b66">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Keudel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename></persName>
+ </author>
+ <title level="m">Verbesserte Energieeffizienz durch radialeinstellendes Fahrwerk, Eisenbahningenieur 05</title>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="42" to="47" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b67">
+ <analytic>
+ <title level="a" type="main">Wheel/rail interface management in heavy haul railway operations-applying science and technology, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Fröhling</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110701413797</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">45</biblScope>
+ <biblScope unit="issue">7-8</biblScope>
+ <biblScope unit="page" from="649" to="677" />
+ <date type="published" when="2007" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b68">
+ <analytic>
+ <title level="a" type="main">Minimising wheel wear by optimising the primary suspension stiffness and centre plate friction of selfsteering bogies, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">N</forename><surname>Fergusson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Fröhling</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Klopper</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110801993094</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="issue">S1</biblScope>
+ <biblScope unit="page" from="457" to="468" />
+ <date type="published" when="2008" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b69">
+ <analytic>
+ <title level="a" type="main">Influence of switches and crossings on wheel profile evolution in freight vehicles. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Casanueva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Doulgerakis</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423114.2014.898779</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="page" from="317" to="337" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b70">
+ <analytic>
+ <title level="a" type="main">A parametric study of the effects of freight vehicles on rolling contact fatigue of rail</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Tunna</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Urban</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1243/09544097JRRT228</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">223</biblScope>
+ <biblScope unit="page">141</biblScope>
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b71">
+ <analytic>
+ <title level="a" type="main">Whole life rail model application and development for RSSBdevelopment of an RCF damage parameter</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Burstow</surname></persName>
+ </author>
+ <idno>AEATR-ES-2003-832</idno>
+ <ptr target="http://www.rssb.co.uk" />
+ </analytic>
+ <monogr>
+ <title level="j">Rail Safety &amp; Standards Board</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <date type="published" when="2003-10" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b72">
+ <analytic>
+ <title level="a" type="main">Wheel damage on the Swedish iron ore line investigated via multibody simulation</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Hossein</forename><surname>Nia</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename></persName>
+ </author>
+ <idno type="doi">228:652.DOI:10.1177/0954409714523264</idno>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the Institution of Mechanical Engineers</title>
+ <meeting>the Institution of Mechanical Engineers</meeting>
+ <imprint>
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b73">
+ <analytic>
+ <title level="a" type="main">The dynamic effects of conventional freight car running over a dipped joint</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">V</forename><surname>Dukkipati</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Dong</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">31</biblScope>
+ <biblScope unit="page" from="95" to="111" />
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b74">
+ <monogr>
+ <title level="m" type="main">A test rig for measuring three piece bogie dynamic parameters applied to freight car application Vehicle System Dynamics 44 supplement</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><forename type="middle">H</forename><surname>Ren</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Shen</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><forename type="middle">S</forename><surname>Hu</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="853" to="861" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b75">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">H</forename><surname>Wickens</surname></persName>
+ </author>
+ <title level="m">The dynamics of railway vehicles on straight track-fundamental considerations of lateral stability Proceedings of the Institution of Mechanical Engineers Part</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="3" to="1965" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b76">
+ <monogr>
+ <title level="m" type="main">Suspension design for high performance two-axle freight vehicles Proceedings of the Institution of Mechanical Engineers Part</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">H</forename><surname>Wickens</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">O</forename><surname>Gilchrist</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">E W</forename><surname>Hobbs</surname></persName>
+ </author>
+ <idno>3D 1969-70</idno>
+ <imprint>
+ <biblScope unit="volume">184</biblScope>
+ <biblScope unit="page" from="22" to="36" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b77">
+ <monogr>
+ <title level="m" type="main">Tracking truck</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><forename type="middle">B</forename><surname>Webber</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1968-07" />
+ <biblScope unit="volume">339466230</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b78">
+ <monogr>
+ <title level="m" type="main">Uklad zawieszenia pojazdu kolejowego, zwlaszcza dwuosiowego wagonu towarowego</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Greenbrier</forename><surname>Europe</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Wagony</forename><surname>Swidnica</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">A</forename></persName>
+ </author>
+ <imprint/>
+ </monogr>
+ <note>PL 207920 B1 B61F 5/30 (2006.01</note>
+</biblStruct>
+
+<biblStruct xml:id="b79">
+ <monogr>
+ <title level="m" type="main">Advances in Rail Wagon Design&apos; Proceedings of the Institution of Mechanical Engineers, Part F: Journal of Rail and Rapid Transit</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Etwell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990-01" />
+ <biblScope unit="volume">204</biblScope>
+ <biblScope unit="page" from="45" to="54" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b80">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section C-part II. Design, fabrication, and construction of freight cars</title>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b81">
+ <monogr>
+ <title level="m" type="main">Chapter 11: Service worthiness tests and analyses for new freight cars</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M-1001</forename></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2007" />
+ <biblScope unit="volume">374</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b82">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section D. Trucks and truck details Specification M-976 Truck performance for rail cars</title>
+ <imprint>
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b83">
+ <analytic>
+ <title level="a" type="main">Comparison of different types of friction wedge suspensions in freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Rudakova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 8-th International Conference on Railway Bogies and Running Gears</title>
+ <meeting>the 8-th International Conference on Railway Bogies and Running Gears<address><addrLine>Budapest: BUTE</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2010" />
+ <biblScope unit="page" from="41" to="50" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b84">
+ <monogr>
+ <title level="m" type="main">Influence of bogie to car body connection parameters on stability and curving of freight vehicle // Extended abstracts 6th international conference Railway bogies and running gears</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Boronenko</forename><forename type="middle">P</forename><surname>Yu</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">M</forename><surname>Orlova</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2004-09" />
+ <biblScope unit="page" from="23" to="25" />
+ <pubPlace>Budapest: BUTE</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b85">
+ <monogr>
+ <title level="m" type="main">Shear Stiffner Linkages for Radial Bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Scheffel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">H</forename><surname>Smit</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1997" />
+ <biblScope unit="volume">27</biblScope>
+ </imprint>
+ </monogr>
+ <note>Supplement to Vehicle System Dynamics</note>
+</biblStruct>
+
+<biblStruct xml:id="b86">
+ <analytic>
+ <title level="a" type="main">The influence of inter-axle linkages on stability and guidance of freight bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Boronenko</forename><forename type="middle">P</forename><surname>Yu</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 8-th mini conference on vehicle system dynamics, identification and anomalies</title>
+ <meeting>the 8-th mini conference on vehicle system dynamics, identification and anomalies<address><addrLine>Budapest: BUTE</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="175" to="182" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b87">
+ <analytic>
+ <title level="a" type="main">Suspension of freight wagon bogiewith the Lenoir friction damper ensuring low wear of wheels and good lateral dynamics of the wagon</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Pazdzierniak</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">T</forename><surname>Adamczewski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proc. of XVIII conference &apos;Pojazdy Szynow</title>
+ <meeting>of XVIII conference &apos;Pojazdy Szynow</meeting>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">I</biblScope>
+ <biblScope unit="page" from="199" to="211" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b88">
+ <analytic>
+ <title level="a" type="main">Wear and energy-saving freight bogie designs with rubber primary springs: principles and experiences</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JRRT</title>
+ <imprint>
+ <biblScope unit="volume">227</biblScope>
+ <biblScope unit="page" from="105" to="110" />
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b89">
+ <analytic>
+ <title level="a" type="main">Innovative Freight Wagons-A Precondition to increase the MarketShare of Rail Freight</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Archives of Transport</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="17" to="26" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+ <note>Polish Academy of Sciences, committee of Transport</note>
+</biblStruct>
+
+<biblStruct xml:id="b90">
+ <monogr>
+ <title level="m" type="main">A new design approach for railway vehicle suspension; Rail International</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Scheffel</surname></persName>
+ </author>
+ <idno>1974. - -â„–10.-P. 638-651</idno>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b91">
+ <monogr>
+ <title level="m" type="main">Project INFRA-RADIAL-bogies for axle loads of 25 t-test and simulation&apos;; XXI Century Rolling Stock: Ideas, Requirements, Projects Conference</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Kik</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Scholdan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Stephanides</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2007" />
+ <pubPlace>St. Petersburg</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b92">
+ <monogr>
+ <title level="m" type="main">Simulation of longitudinal dynamics of long freight trains in positioning operations Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Z</forename><surname>Qi</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Z</forename><surname>Huang</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">X</forename><surname>Kong</surname></persName>
+ </author>
+ <imprint>
+ <biblScope unit="volume">50</biblScope>
+ <biblScope unit="page" from="2012" to="1409" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b93">
+ <monogr>
+ <title level="m" type="main">Numerical and experimental approach for the evaluation of severe longitudinal dynamics of heavy freight trains Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Belforte</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Celi</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Diana</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Melzi</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="937" to="955" />
+ </imprint>
+ </monogr>
+ <note>Supplement</note>
+</biblStruct>
+
+<biblStruct xml:id="b94">
+ <monogr>
+ <title level="m" type="main">Longitudinal train dynamics&apos; in Handbook of Railway Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+ <note>Iwnicki ed Taylor and Francis</note>
+</biblStruct>
+
+<biblStruct xml:id="b95">
+ <analytic>
+ <title level="a" type="main">An investigation of the effect of bogie and wagon pitch associated with longitudinal train dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Mcclanachan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Roach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><surname>Scown</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The dynamics of vehicles on roads and on tracks Vehicle System Dynamics Supplement Swets &amp; Zeitlinger Amsterdam</title>
+ <imprint>
+ <date type="published" when="1999" />
+ <biblScope unit="page" from="374" to="385" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b96">
+ <analytic>
+ <title level="a" type="main">Recent advancements in bluff and draft testing techniques</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>El-Sibaie</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Fifth International Heavy Haul Conference</title>
+ <meeting><address><addrLine>Beijing</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1993" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b97">
+ <analytic>
+ <title level="a" type="main">White Paper Innovative Rail Freight Wagon 2030</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Koenig</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ <ptr target="http://www.schienenfzg.tu-Berlin.de/fileadmin/fg62/Dokumente/Downloads/White_Paper_Innovative_Rail_Freight_Wagon_2030.pdf" />
+ </analytic>
+ <monogr>
+ <title level="j">TU Dresden TU Berlin</title>
+ <imprint>
+ <date type="published" when="2012" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b98">
+ <analytic>
+ <title level="a" type="main">On the Theory of Nonlinear Dynamics and its Applications in Vehicle Systems Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">31</biblScope>
+ <biblScope unit="issue">5-6</biblScope>
+ <biblScope unit="page" from="393" to="421" />
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b99">
+ <analytic>
+ <title level="a" type="main">Numerical simulation of wheel wear evolution for heavy haul railway</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Wang</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><surname>Gao</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1007/s11771-015-2510-1</idno>
+ </analytic>
+ <monogr>
+ <title level="j">J. Cent. South Univ</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="196" to="207" />
+ <date type="published" when="2015" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python_hadoop/tests/files/example.cdx b/python_hadoop/tests/files/example.cdx
new file mode 100644
index 0000000..84e3271
--- /dev/null
+++ b/python_hadoop/tests/files/example.cdx
@@ -0,0 +1,20 @@
+edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.pdf 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pdf 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pdf application/pdf 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,ifaamas)/proceedings/aamas09/pdf/01_full%20papers/02_08_fp_0272.pdf 20170706081902 http://www.ifaamas.org/Proceedings/aamas09/pdf/01_Full%20Papers/02_08_FP_0272.pdf application/pdf 200 GYHX35QJWRJELWJ5GDQZPTPOUUZOCTKF - - 251180 34635154 CITESEERX-CRAWL-2017-06-20-20170706081825105-00419-00428-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706081838210-00420-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,fau,cs)/publications/2014/lukas_14_masterthesis.pdf 20170705101722 http://www4.cs.fau.de/Publications/2014/lukas_14_masterthesis.pdf application/pdf 200 GIUQT7SXZ33TWEFBM2MWURJI2M3QE3IW - - 1290532 71068435 CITESEERX-CRAWL-2017-06-20-20170705101605019-00279-00288-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705101714659-00281-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+de,bund,jki,pub)/index.php/jabfq/article/download/3568/4462 20170706041152 http://pub.jki.bund.de/index.php/JABFQ/article/download/3568/4462/ text/html 301 XZBNO24W2ZPQQMJYE6YUUCSRUF7G3ZBT - - 552 417292708 CITESEERX-CRAWL-2017-06-20-20170706040506112-00160-00169-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706041021844-00165-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+whois://whois.arin.net/z+%2B+132.177.133.114 20170713120653 whois://whois.arin.net/z+%2B+132.177.133.114 text/plain - IDEID4YQ6MVJSOE57NPVDLL53ZB3J4DX - - 876 30983517 CITESEERX-CRAWL-2017-06-20-20170707064626094-01007-01015-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170711214025652-01014-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+za,co,csir,researchspace)/dspace/bitstream/10204/4048/1/smith2_2010.pdf 20170706094159 http://researchspace.csir.co.za/dspace/bitstream/10204/4048/1/Smith2_2010.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 104830407 CITESEERX-CRAWL-2017-06-20-20170706093829986-00509-00518-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706094137978-00512-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/article.aspx?articleid=705034 20170707013120 http://annals.org/article.aspx?articleid=705034 text/html 301 QQYKL57QSERLFM3LXSWMNOFXMOCN7C5G - - 22665 28113974 CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-00976-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf 20170707045304 http://annals.org/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf text/html 302 423S7EMGLCVIZ3FLVD7TLAG75HWE4RGI - - 644 222908628 CITESEERX-CRAWL-2017-06-20-20170707042504366-00997-01006-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707045044604-00999-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+com,sagepub,spi)/content/28/4/501.full.pdf 20170705092027 http://spi.sagepub.com/content/28/4/501.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 396 553180242 CITESEERX-CRAWL-2017-06-20-20170705091311851-00219-00228-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705091759818-00223-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+ir,mediaj)/favicon.ico 20170705075240 http://mediaj.ir/favicon.ico text/html 404 E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK - - 589 455827180 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705075051100-00135-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+com,sagepub,jpr)/content/8/3-4/239.full.pdf 20170705074931 http://jpr.sagepub.com/content/8/3-4/239.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 400 270368088 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+jp,co,nittuden)/business/pdf/transparent_thermoplastic_resin_with_electron_beam_cross-linking.pdf 20170706083459 http://www.nittuden.co.jp/business/pdf/Transparent_Thermoplastic_Resin_with_Electron_Beam_Cross-Linking.pdf application/pdf 200 V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 - - 715081 761088410 CITESEERX-CRAWL-2017-06-20-20170706082646066-00429-00438-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706083257353-00436-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+lt,lms)/robots.txt 20170705122708 http://www.lms.lt/robots.txt text/plain 200 PF3HTQQT2ULYRWFLJGUWZKHTVZUVMZ2F - - 592 668333707 CITESEERX-CRAWL-2017-06-20-20170705121748408-00399-00408-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705122352502-00406-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+hu,bme,phy)/~szalay/pub/multipartcriteriaposter.pdf 20170705124828 http://www.phy.bme.hu/%7Eszalay/pub/multipartcriteriaPoster.pdf application/pdf 200 L3TUEEZLBJTHAVH74B5N426FAIDBCCOE - - 187866 964760782 CITESEERX-CRAWL-2017-06-20-20170705123641979-00419-00428-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705124315591-00426-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,adb,openaccess)/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf;jsessionid=f966a3bdac9882ec5a7c326b130f6f81?sequence=1 20170705090940 https://openaccess.adb.org/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf%3Bjsessionid%3DF966A3BDAC9882EC5A7C326B130F6F81?sequence%3D1 unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 515 634039376 CITESEERX-CRAWL-2017-06-20-20170705090333400-00209-00218-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705090728803-00212-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,physiology,ajpregu)/content/272/4/r1084 20170706131006 http://ajpregu.physiology.org/content/272/4/R1084 text/html 200 3FOQSKT4WBYOUA6VKKJCEQCN6QF35ANT - - 27346 336293585 CITESEERX-CRAWL-2017-06-20-20170706130432396-00707-00716-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706130850866-00711-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,desy,www-it)/common/documentation/cd-docs/sc2002/paperpdf/pap234.pdf 20170705121813 http://www-it.desy.de/common/documentation/cd-docs/SC2002/paperpdf/pap234.pdf application/pdf 200 BONCZ4NNGRNYR22ASFVU7VYTQ24RRNP4 - - 72421 381715704 CITESEERX-CRAWL-2017-06-20-20170705120827801-00389-00398-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705121708700-00397-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,bmb)/content/28/3/247.full.pdf 20170706014948 http://bmb.oxfordjournals.org/content/28/3/247.full.pdf text/html 301 EJWYVOPONJRARK7SGG6COFRN7CSTHROY - - 643 119398161 CITESEERX-CRAWL-2017-06-20-20170706014800946-00020-00029-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706014907678-00022-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
diff --git a/python_hadoop/tests/files/example_grobid_metadata.json b/python_hadoop/tests/files/example_grobid_metadata.json
new file mode 100644
index 0000000..a2d18db
--- /dev/null
+++ b/python_hadoop/tests/files/example_grobid_metadata.json
@@ -0,0 +1,5 @@
+{"abstract": "In this paper an analytical model is presented for the Micro-Cantilever (MC) of Atomic Force Microscopy with Side Wall probe (AFM-SW) in the tapping excitation mode. In this model the couple motion of the MC is taken into account while the torsional motion is considered as an undesirable motion which is coupled with the vertical motion. To this end, the effect of several parameters, namely; probe mass, probe dislocation, sidewall extension length, and tip sample interaction force is investigated on the occurrence probability of torsional and vertical motions. It is found that the probe dislocation is the prerequisite factor of the undesired motion happening. For sake of validation, the analytical results are compared against the previously published results, and an excellent agreement is observed. Abstrak Dalam kertas ini, model analitikal dipersembahkan bagi micro-julur Mikroskop Daya Atom dengan prob dinding-sisi dan dalam mod pengujaan menoreh. Dalam model ini, gerakan pasangan bagi mikro-julur diambil kira manakala gerakan kilasan dianggap sebagai gerakan yang tidak diingini yang digandingkan dengan pergerakan menegak. Untuk tujuan ini , kesan daripada beberapa parameter, iaitu; jisim prob, kehelan prob, panjang lanjutan sisi, dan daya interaksi di antara tip dan sampel disiasat keatas kebarangkalian berlakunya gerakan kilasan dan menegak. Didapati bahawa kehelan prob adalah faktor prasyarat berlakunya gerakan yang tidak diingini. Untuk pengesahan, keputusan analisis ini dibandingkan dengan keputusan yang sebelum ini telah diterbitkan, dan didapati persetujuannya sangat baik. Kata kunci: Mokroskop daya atom, prob dind ing sisi, micro-jalur, getaran, gerakan pasangan", "acknowledgement": "Acknowledgement We are grateful for the UTM scholarship to Author 1. 
Authors gratefully acknowledge t he Research Institute of Petroleum Industry (RIPI) and the Iran Nanotechnology Laboratory Network (INLN) for their support.", "authors": [{"name": "Farzad Mokhtarinezhad"}, {"name": "Roslan Rahman"}, {"name": "Sina Eftekhar"}, {"name": "Sadegh Hassani"}], "citations": [{"authors": [{"name": "Julie Last"}, {"name": "Paul Russell"}, {"name": "P aul Nealey"}, {"name": "Christopher Murphy"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Investigative Ophthalmology & Visual Science", "publisher" : null, "title": "The applications of atomic force microscopy to vision science", "url": null, "volume": "51"}, {"authors": [{"name": "G Binnig"}, {"name": "C Quate"}, {"name": "C Geber"}], "date": "1986", "id": "b1", "index": 1, "issue": null, "journal": "Phys Rev Let", "publisher": null, "title": "Atomic force microscope", "url": null, "vol ume": "56"}, {"authors": [{"name": "C Wright"}, {"name": "Armstrong"}], "date": "2006", "id": "b2", "index": 2, "issue": null, "journal": "Surf Interface Anal", "publisher" : null, "title": "The application of atomic force microscopy force measurements to the characterisation of microbial surfaces", "url": null, "volume": "38"}, {"authors": [{ "name": "John Withers"}, {"name": "D Aston"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": "Advances in Colloid and Interface Science", "publisher": null, "title": "Nanomechanical measurements with AFM in the elastic limit", "url": null, "volume": "120"}, {"authors": [{"name": "Dara Bayat"}, {"name": "Terunobu Akiyama"}, {"name": "F Nicolaas"}, {"name": "Urs De Rooij"}, {"name": "Staufer"}], "date": "2008", "id": "b4", "index": 4, "issue": null, "journal": "Microelectronic Engineering", "p ublisher": null, "title": "Dynamic behavior of the tuning fork AFM probe", "url": null, "volume": "85"}, {"authors": [{"name": "M Kahrobaiyan"}, {"name": "M Ahmadian"}, {"name": "P Haghighi"}, {"name": "A Haghighi"}], "date": "2010", "id": 
"b5", "index": 5, "issue": null, "journal": "International Journal of Mechanical Sciences", "publisher": null, "title": "Sensitivity and resonant frequency of an AFM with sidewall and top-surface probes for both flexural and torsional modes", "url": null, "volume": "52"}, {"a uthors": [{"name": "Gaoliang Dai"}, {"name": "Helmut Wolff"}, {"name": "Frank Pohlenz"}, {"name": "Hans-Ulrich Danzebrink"}, {"name": "G5Cu00fcnter Wilkening"}], "date": "2006", "id": "b6", "index": 6, "issue": null, "journal": "APPLIED PHYSICS LETTERS", "publisher": null, "title": "Atomic force probe for sidewall scanning of nano-and micro structures", "url": null, "volume": "88"}, {"authors": [{"name": "Gaoliang Dai"}, {"name": "Helmutwolff"}, {"name": "Min Thomasweimann"}, {"name": "Frank Xu"}, {"name": "Ha ns-Ulrich Pohlenz"}, {"name": "Danzebrink"}], "date": "2007", "id": "b7", "index": 7, "issue": null, "journal": "Meas. Sci. Technol", "publisher": null, "title": "Nanoscale surface measurements at sidewalls of nanoand micro-structures", "url": null, "volume": "18"}, {"authors": [{"name": "Win-Jin Chang"}, {"name": "Haw-Long Lee"}, {"name": "T erry Yuan-Fang Chen"}], "date": "2008", "id": "b8", "index": 8, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Study of the sensitivity of the fi rst four flexural modes of an AFM cantilever with a sidewall probe", "url": null, "volume": "108"}, {"authors": [{"name": "Xiaohui Tang"}, {"name": "Vincent Bayot"}, {"name": "Nicolas Reckinger"}, {"name": "Denis Flandre"}, {"name": "Jean-Pierre Raskin"}, {"name": "Emmanuel Dubois"}, {"name": "Bernard Nysten"}], "date": "2009", "id": "b9", "i ndex": 9, "issue": null, "journal": "IEEE Transactions on Nanotechnogoly", "publisher": null, "title": "A Simple Method for Measuring Si-Fin Sidewall Roughness by AFM", "ur l": null, "volume": "8"}, {"authors": [{"name": "Ali Hossein Nejat Pishkenari"}, {"name": "Meghdari"}], "date": "2011", "id": "b10", "index": 10, "issue": null, 
"journal": "Ultramicroscopy", "publisher": null, "title": "Influence of the tip mass on the tip-sample interactions in TM-AFM", "url": null, "volume": "111"}, {"authors": [{"name": "S ohrab Eslami"}, {"name": "Naderjalili"}], "date": "2012", "id": "b11", "index": 11, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A comprehensiv e modeling and vibration analysis of AFM microcantilevers subjected to nonlinear tip-sample interaction forces", "url": null, "volume": "117"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Journal of Applied Physics", "publisher": null, "title": "Couplin g of cantilever lateral bending and torsion in torsional resonance and lateral excitation modes of atomic force microscopy", "url": null, "volume": "99"}, {"authors": [{"name": "Haw-Long Lee"}, {"name": "Win-Jin Chang"}], "date": "2008", "id": "b13", "index": 13, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Couple d lateral bending-torsional vibration sensitivity of atomic force microscope cantilever", "url": null, "volume": "108"}, {"authors": [{"name": "Farzad Mokhtarinezhad"}], "d ate": "2015", "id": "b14", "index": 14, "issue": null, "journal": null, "publisher": null, "title": "Jurnal Teknologi (Sciences & Engineering)", "url": null, "volume": "76" }, {"authors": [{"name": "F Mokhtari-Nezhad"}, {"name": "A Saidi"}, {"name": "S Ziaei-Rad"}], "date": "2009", "id": "b15", "index": 15, "issue": null, "journal": "Ultramicr oscopy", "publisher": null, "title": "Influence of the tip mass and position on the AFM cantilever dynamics: Coupling between bending, torsion and flexural modes", "url": null, "volume": "109"}, {"authors": [{"name": "Arvind Raman"}, {"name": "John Melcher"}, {"name": "Ryan Tung"}], "date": "2008", "id": "b16", "index": 16, "issue": null, "jo urnal": "Nanotodays", "publisher": null, "title": "Cantilever dynamics in atomic force 
microscopy", "url": null, "volume": "3"}, {"authors": [{"name": "Nader Jalili"}, {"name": "Karthik Laxminarayana"}], "date": "2004", "id": "b17", "index": 17, "issue": null, "journal": "Mechatronic", "publisher": null, "title": "A review of atomic force mic roscopy imaging systems: application to molecular metrology and biological sciences", "url": null, "volume": "14"}, {"authors": [{"name": "B Derjaguin"}, {"name": "V Muller "}, {"name": "Y Toporov"}], "date": "1975", "id": "b18", "index": 18, "issue": null, "journal": "J. Colloid Interf. Sci", "publisher": null, "title": "Effect of contact def ormations on the adhesion of particles", "url": null, "volume": "53"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b19", "index ": 19, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Simulation of dynamic modes of atomic force microscopy using a 3D finite element model", "u rl": null, "volume": "106"}, {"authors": [{"name": "K Johnson"}, {"name": "K Kendall"}, {"name": "A Roberts"}], "date": "1971", "id": "b20", "index": 20, "issue": null, "jo urnal": "Proc. R. Soc. London Ser. 
A", "publisher": null, "title": "Surface energy and the contact of elastic solids", "url": null, "volume": "324"}, {"authors": [{"name": "D Gorman"}], "date": "1975", "id": "b21", "index": 21, "issue": null, "journal": null, "publisher": null, "title": "Free Vibration Analysis of Beams and Shafts", "url": null, "volume": null}, {"authors": [{"name": "M Mahdavi"}, {"name": "A Farshidianfar"}, {"name": "M Tahani"}, {"name": "S Mahdavi"}, {"name": "H Dalir"}], "date": "2008", "id ": "b22", "index": 22, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A more comprehensive modeling of atomic force microscope cantilever", "url" : null, "volume": "109"}, {"authors": [{"name": "M Reinstadtler"}, {"name": "U Rabe"}, {"name": "V Scherer"}, {"name": "U Hartmann"}, {"name": "A Goldade"}, {"name": "B Bhu shan"}, {"name": "W Arnold"}], "date": "2003", "id": "b23", "index": 23, "issue": null, "journal": "Applied physics letters", "publisher": null, "title": "On the nanoscale measurement of friction using atomic-force microscope cantilever torsional resonances", "url": null, "volume": "82"}, {"authors": [{"name": "M Reinst5Cu00e4dtler"}, {"name": "T Kasai"}, {"name": "U Rabe"}, {"name": "B Bhushan"}, {"name": "W Arnold"}], "date": "2005", "id": "b24", "index": 24, "issue": null, "journal": "Journal of Physics D: Applied Physics", "publisher": null, "title": "Imaging and measurement of elasticity and friction using the TRmode", "url": null, "volume": "38"}], "date": "2015", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": "76"}, "title": "Jurnal Teknologi Full Paper INVESTIGATION OF TORSI ONAL DEFLECTION AS AN UNDESIRED MOTION IN ATOMIC FORCE MICROSCOPY WITH SIDEWALL PROBE"}
+{"abstract": "Eight months after triple valve replacement with Bjork-Shiley tilting disc valves a patient developed symptoms and signs suggesting malfunction of the prosthesis in the tricuspid position. This was confirmed by echocardiography and angiocardiography, and at operation the di sc of the prosthesis was found to be stuck half-open by fibrin and clot. A further 11 patients with the same type of prosthesis in the tricuspid position were then studied by phonocardiography and echocardiography. In one of these the prosthesis was found to be stuck and this was confirmed by angiocardiography and surgery. These 2 cases are r eported in detail and thefindings in the other 10 are discussed. The implications of this high incidence of malfunction of the Bj6rk-Shiley prosthesis in the tricuspid posi tion are considered. Echocardiography appears to be essential in the follow-up of such patients.", "acknowledgement": null, "authors": [{"name": "P Bourdillon"}, {"name": " G Sharratt"}], "citations": [{"authors": [{"name": "J Assad-Morell"}, {"name": "A Tajik"}, {"name": "M Anderson"}, {"name": "R Tancredi"}, {"name": "R Wallace"}, {"name": " E Giuliani"}], "date": "1974", "id": "b0", "index": 0, "issue": null, "journal": "Mayo Clinic Proceedings", "publisher": null, "title": "Malfunctioning tricuspid valve pros thesis", "url": null, "volume": "49"}, {"authors": [{"name": "R Bache"}, {"name": "A From"}, {"name": "A Castaneda"}, {"name": "C Jorgensen"}, {"name": "Wang"}, {"name": "Y "}], "date": "1972", "id": "b1", "index": 1, "issue": null, "journal": "Chest", "publisher": null, "title": "Late thrombotic obstruction of Starr-Edwards tricuspid valve pr osthesis", "url": null, "volume": null}, {"authors": [{"name": "I Belenkie"}, {"name": "M Carr"}, {"name": "R Schlant"}, {"name": "D Nutter"}, {"name": "P Symbas"}], "date" : "1973", "id": "b2", "index": 2, "issue": null, "journal": "American Heart,Journal", "publisher": null, "title": "Malfunction of a Cutter Smeloff mitral 
ball valve prosthe sis: diagnosis by phonocardiography and echocardiography", "url": null, "volume": "86"}, {"authors": [{"name": "J Douglas"}, {"name": "Williams"}, {"name": "G"}], "date": " 1974", "id": "b3", "index": 3, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic evaluation of the Bjork-Shiley prosthetic valve", "ur l": null, "volume": "50"}, {"authors": [{"name": "J Gimenez"}, {"name": "W Winters"}, {"name": "Jr"}, {"name": "J Davila"}, {"name": "J Connell"}, {"name": "K Klein"}], "da te": "1965", "id": "b4", "index": 4, "issue": null, "journal": "American Journal of the Medical Sciences", "publisher": null, "title": "Dynamics of the StarrEdwards ball va lve prosthesis: a cine-fluorographic and ultrasonic study in humans", "url": null, "volume": "250"}, {"authors": [{"name": "M Johnson"}, {"name": "J Holmes"}, {"name": "Pat on"}, {"name": "B"}], "date": "1973", "id": "b5", "index": 5, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic determination of mitra l disc valve excursion", "url": null, "volume": "47"}, {"authors": [{"name": "M Johnson"}, {"name": "B Paton"}, {"name": "J Holmes"}], "date": "1970", "id": "b6", "index": 6, "issue": null, "journal": "Circulation", "publisher": null, "title": "Ultrasonic evaluation of prosthetic valve motion", "url": null, "volume": null}, {"authors": [{"name": "H Miller"}, {"name": "D Gibson"}, {"name": "J Stephens"}], "date": "1973", "id": "b7", "index": 7, "issue": null, "journal": "British Heart Journal", "publisher": null , "title": "Role of echocardiography and phonocardiography in diagnosis of mitral paraprosthetic regurgitation with Starr-Edwards prostheses", "url": null, "volume": "35"}, {"authors": [{"name": "P Oliva"}, {"name": "M Johnson"}, {"name": "M Pomerantz"}, {"name": "Levene"}, {"name": "A"}], "date": "1973", "id": "b8", "index": 8, "issue": null , "journal": "American journal of Cardiology", "publisher": null, "title": 
"Dysfunction of the Beall mitral prosthesis and its detection by cinefluoroscopy and echocardiogr aphy", "url": null, "volume": null}, {"authors": [{"name": "J Pfeifer"}, {"name": "N Goldschlager"}, {"name": "T Sweatman"}, {"name": "F Gerbode"}, {"name": "A Selzer"}], " date": "1972", "id": "b9", "index": 9, "issue": null, "journal": "American J7ournal of Cardiology", "publisher": null, "title": "Malfunction of mitral ball valve prosthesis due to thrombus: report of 2 cases with notes on early clinical diagnosis", "url": null, "volume": "29"}, {"authors": [{"name": "H Samaan"}, {"name": "R Murali"}], "date": "1970", "id": "b10", "index": 10, "issue": null, "journal": "Thorax", "publisher": null, "title": "Acute tricuspid valve obstruction following the use of tricuspid ball va lve prosthesis", "url": null, "volume": null}, {"authors": [{"name": "S Suwansirikul"}, {"name": "E Glassman"}, {"name": "F Raia"}, {"name": "F Spencer"}], "date": "1974", "id": "b11", "index": 11, "issue": null, "journal": "American J'ournal of Cardiology", "publisher": null, "title": "Late thrombosis of Starr-Edwards tricuspid ball valve pr osthesis", "url": null, "volume": "34"}, {"authors": [{"name": "Vander"}, {"name": "J Veer"}, {"name": "Jr"}, {"name": "G Rhyneer"}, {"name": "R Hodam"}, {"name": "F Kloste r"}], "date": "1971", "id": "b12", "index": 12, "issue": null, "journal": "Circulation", "publisher": null, "title": "Obstruction of tricuspid ball-valve prostheses", "url" : null, "volume": null}, {"authors": [{"name": "W Winters"}, {"name": "Jr"}, {"name": "J Gimenez"}, {"name": "L Soloff"}], "date": "1967", "id": "b13", "index": 13, "issue" : null, "journal": "American journal of Cardiology", "publisher": null, "title": "Clinical application of ultrasound in the analysis of prosthetic ball valve function", "ur l": null, "volume": "19"}, {"authors": [{"name": "P D V Requests For Reprints To Dr"}, {"name": "Western Bourdillon"}, {"name": "Hospital"}, {"name": "Oakley Road"}], "date 
": false, "id": "b14", "index": 14, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}], "date": "1976", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": "British Heart Journal", "publisher": null, "volume": "38"}, "title": "Malfunction of Bjork-Shiley valve prosthesis in tricuspid positi on"}
+{"abstract": "The interference is the major factor disrupting the sending of information in wireless networks. To ge t better performance for these networks as well in the conventional case as in cooperative one, all the necessary ways must be used to eliminate network interference. This article deals with the concept of Physical Layer Network Coding (PLNC). It is a way to exploit the operation of Network Coding (NC) that occurs naturally in the superimpose d electromagnetic waves (EM). It is a simple physical effect when several EM waves meet in the same physical space, they are mixed together. This mixture of EM waves is a f orm of NC produced by nature. Hence, the situation will be reversed and the interference will be a beneficial way to help the relay when sending information. This paper foc uses on the Symbol Error Rate (SER) Analysis of PLNC in the case of 16QAM modulator. It will exploit in detail the concept of mapping (modulation/demodulation) and will dem onstrate its contribution compared to NC and Traditional Network (TN).", "acknowledgement": "Conclusion In this paper, we took a brief description of different cases of coo perative networks in the case of TWRC. We describe the Traditional cooperative Networks, then the Network Coding, and finally, the Physical Layer Network Coding. This one a llows us to reduce the number of phases from 4 to 2. Furthermore, this paper illustrates that in PLNC and for the case of 16QAM constellation, the SER is lower than in the standard modulation case. 
This is verified with the modulation/demodulation study done and for the In-phase and quadrature case of the modulator.", "authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "citations": [{"authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "date": "2012", "id": "b0", "index": 0, "issue": null, "journal": "IEEE Electrotechnical Conference (MELECON)", "publisher": null, "title": "Optimizing of Power Allocation for Two-Hop DF Relaying Systems", "url": null, "volume": null}, {"autho rs": [{"name": "J Proakis"}], "date": "1989", "id": "b1", "index": 1, "issue": null, "journal": null, "publisher": null, "title": "Digital Communication", "url": null, "vol ume": null}, {"authors": [{"name": "S Tian"}, {"name": "Li Yonghui"}, {"name": "B Vucetic"}], "date": "2011", "id": "b2", "index": 2, "issue": null, "journal": "IEEE ICC", "publisher": null, "title": "A Near Optimal Amplify and Forward Relaying in Two-Way Relay Networks", "url": null, "volume": null}, {"authors": [{"name": "S Zhang"}, {"name" : "S Liew"}, {"name": "P Lam"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": null, "publisher": null, "title": "Physical Layer Network Coding. ACM Mob iCom", "url": null, "volume": null}], "date": "2013", "doi": null, "journal": {"eissn": null, "issn": null, "issue": "3", "name": "AWERProcedia Information Technology & Com puter Science", "publisher": null, "volume": "03"}, "title": "SER Analysis of Two-Hop Physical Layer Network Coding with 16QAM Modulator, AWERProcedia Information Technolog y & Computer Science"}
+{"abstract": "Suffix trees are by far the most important data structure in stringology, with myriads of applications in fields like bioinformatics and information retrieval. Classical representations of suffix trees require O(n log n) bits of space, for a string of size n. This is consid erably more than the n log 2 5Cu03c3 bits needed for the string itself, where 5Cu03c3 is the alphabet size. The size of suffix trees has been a barrier to their wider a doption in practice. Recent compressed suffix tree representations require just the space of the compressed string plus 5Cu0398(n) extra bits. This is already spectacular , but still unsatisfactory when 5Cu03c3 is small as in DNA sequences. In this paper we introduce the first compressed suffix tree representation that breaks this linear-s pace barrier. Our representation requires sublinear extra space and supports a large set of navigational operations in logarithmic time. An essential ingredient of our repr esentation is the lowest common ancestor (LCA) query. We reveal important connections between LCA queries and suffix tree navigation.", "acknowledgement": null, "authors": [{"name": "Lu5Cu00eds Russo"}, {"name": "Gonzalo Navarro"}, {"name": "Arlindo Oliveira"}], "citations": [{"authors": [{"name": "A Apostolico"}], "date": "1985", "id": "b0 ", "index": 0, "issue": null, "journal": "Combinatorial Algorithms on Words. NATO ISI Series", "publisher": null, "title": "The myriad virtues of subword trees", "url": null, "volume": null}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2000", "id": "b1", "index": 1, "issue": null, "journal": "Proceedings of LATIN ", "publisher": null, "title": "The LCA problem revisited", "url": null, "volume": "1776"}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2004", "id": "b2", "index": 2, "issue": "1", "journal": "Theor. Comp. 
Sci", "publisher": null, "title": "The level ancestor problem simplified", "url": null, "volume": "321"}, {" authors": [{"name": "M Farach"}], "date": "1997", "id": "b3", "index": 3, "issue": null, "journal": "Proceedings of FOCS", "publisher": null, "title": "Optimal suffix tree construction with large alphabets", "url": null, "volume": null}, {"authors": [{"name": "P Ferragina"}, {"name": "G Manzini"}, {"name": "V M5Cu00e4kinen"}, {"name": "G Na varro"}], "date": "2007", "id": "b4", "index": 4, "issue": "2", "journal": "ACM Trans. Algor", "publisher": null, "title": "Compressed representations of sequences and full -text indexes", "url": null, "volume": "3"}, {"authors": [{"name": "J Fischer"}, {"name": "V Heun"}], "date": "2007", "id": "b5", "index": 5, "issue": null, "journal": "Pro ceedings of ESCAPE", "publisher": null, "title": "A new succinct representation of RMQ-information and improvements in the enhanced suffix array", "url": null, "volume": "4 614"}, {"authors": [{"name": "L Foschini"}, {"name": "R Grossi"}, {"name": "A Gupta"}, {"name": "J Vitter"}], "date": "2006", "id": "b6", "index": 6, "issue": "4", "journal ": "ACM Trans. Algor", "publisher": null, "title": "When indexing equals compression: Experiments with compressing suffix arrays and applications", "url": null, "volume": " 2"}, {"authors": [{"name": "R Geary"}, {"name": "R Raman"}, {"name": "V Raman"}], "date": "2004", "id": "b7", "index": 7, "issue": null, "journal": "Proceedings of SODA", " publisher": null, "title": "Succinct ordinal trees with level-ancestor queries", "url": null, "volume": null}, {"authors": [{"name": "R Giegerich"}, {"name": "S Kurtz"}, {"name": "J Stoye"}], "date": "2003", "id": "b8", "index": 8, "issue": "11", "journal": "Softw., Pract. 
Exper", "publisher": null, "title": "Efficient implementation of lazy suffix trees", "url": null, "volume": "33"}, {"authors": [{"name": "D Gusfield"}], "date": "1997", "id": "b9", "index": 9, "issue": null, "journal": null, "publisher": null , "title": "Algorithms on Strings, Trees and Sequences", "url": null, "volume": null}, {"authors": [{"name": "D Knuth"}, {"name": "J"}, {"name": "V Pratt"}], "date": "1977" , "id": "b10", "index": 10, "issue": "2", "journal": "SIAM J. Comput", "publisher": null, "title": "Fast pattern matching in strings", "url": null, "volume": "6"}, {"author s": [{"name": "S Lee"}, {"name": "K Park"}], "date": "2007", "id": "b11", "index": 11, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic rank-select structures with applications to run-length encoded texts", "url": null, "volume": "4580"}, {"authors": [{"name": "V M5Cu00e4kinen"}, {"name": "G Navarro"}], " date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic entropy-compressed sequences and full-text ind exes", "url": null, "volume": "4009"}, {"authors": [{"name": "U Manber"}, {"name": "E Myers"}], "date": "1993", "id": "b13", "index": 13, "issue": "5", "journal": "SIAM J. Comput", "publisher": null, "title": "Suffix arrays: A new method for on-line string searches", "url": null, "volume": "22"}, {"authors": [{"name": "G Manzini"}], "date": " 2001", "id": "b14", "index": 14, "issue": "3", "journal": "J. ACM", "publisher": null, "title": "An analysis of the Burrows-Wheeler transform", "url": null, "volume": "48"} , {"authors": [{"name": "E Mccreight"}], "date": "1976", "id": "b15", "index": 15, "issue": "2", "journal": "J. 
ACM", "publisher": null, "title": "A space-economical suffix tree construction algorithm", "url": null, "volume": "32"}, {"authors": [{"name": "G Navarro"}, {"name": "V M5Cu00e4kinen"}], "date": "2007", "id": "b16", "index": 16, " issue": "1", "journal": "ACM Comp. Surv", "publisher": null, "title": "Compressed full-text indexes", "url": null, "volume": "39"}, {"authors": [{"name": "R Raman"}, {"name": "V Raman"}, {"name": "S Rao"}], "date": "2002", "id": "b17", "index": 17, "issue": null, "journal": "Proceedings of SODA", "publisher": null, "title": "Succinct indexabl e dictionaries with applications to encoding k-ary trees and multisets", "url": null, "volume": null}, {"authors": [{"name": "L Russo"}, {"name": "A Oliveira"}], "date": "2 006", "id": "b18", "index": 18, "issue": null, "journal": "Proceedings of SPIRE", "publisher": null, "title": "A compressed self-index using a Ziv-Lempel dictionary", "url" : null, "volume": "4209"}, {"authors": [{"name": "K Sadakane"}], "date": "2003", "id": "b19", "index": 19, "issue": "2", "journal": "J. of Algorithms", "publisher": null, " title": "New text indexing functionalities of the compressed suffix arrays", "url": null, "volume": "48"}, {"authors": [{"name": "K Sadakane"}], "date": "2007", "id": "b20" , "index": 20, "issue": null, "journal": "Theo. Comp. 
Sys", "publisher": null, "title": "Compressed Suffix Trees with Full Functionality", "url": null, "volume": null}, {"a uthors": [{"name": "E Ukkonen"}], "date": "1995", "id": "b21", "index": 21, "issue": "3", "journal": "Algorithmica", "publisher": null, "title": "Construting suffix trees o n-line in linear time", "url": null, "volume": "14"}, {"authors": [{"name": "N V5Cu00e4lim5Cu00e4ki"}, {"name": "W Gerlach"}, {"name": "K Dixit"}, {"name": "V M5Cu00e 4kinen"}], "date": "2007", "id": "b22", "index": 22, "issue": null, "journal": "Proceedings of WEA", "publisher": null, "title": "Engineering a compressed suffix tree imple mentation", "url": null, "volume": "4525"}, {"authors": [{"name": "P Weiner"}], "date": "1973", "id": "b23", "index": 23, "issue": null, "journal": "Proceedings of IEEE Sym p. on Switching and Automata Theory", "publisher": null, "title": "Linear pattern matching algorithms", "url": null, "volume": null}], "date": false, "doi": null, "journal" : {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": null}, "title": "Fully-Compressed Suffix Trees"}
+{"abstract": null, "acknowledgement": null, "authors": [{"name": "Carine Van Huls Van Taxis"}, {"name": "Sebastiaan Piers"}, {"name": "Marta De Riva Silva"}, {"name": "Olaf Dekkers"}, {"name": "Dani5Cu00ebl Pijnappels"}, {"name": "Martin Schalij"}, {"name": "Adrianus Wijnmaalen"}, {"name": "Katja Zeppenfeld"}], "citations": [{"authors": [{"name": "T Baman"}, {"name": "D Lange"}, {"name": "K Ilg"}, {"name": "S Gupta"}, {"name": "T Liu"}, {"name": "C Algui re"}, {"name": "W Armstrong"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "K Jongnarangsin"}, {"name": "F Pelosi"}, {"name": "Jr Crawford"}, {"name": "T Ebinger"}, { "name": "M Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Relationship between burden of premature ventricular complexes and left ventricular function", "url": null, "volume": "7"}, {"authors": [{"name": "M Yokoka wa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Alguire"}, {"name": "C Armstrong"}, {"name": "W Crawford"}, {"name": "T Jongnarangsin"}, {"name": "K Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": "b1", "index": 1, "issue": null, "journal": "Heart Rh ythm", "publisher": null, "title": "Relation of symptoms and symptom duration to premature ventricular complex-induced cardiomyopathy", "url": null, "volume": "9"}, {"autho rs": [{"name": "M Yokokawa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "T Crawford"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Jongnarangsin"}, {"name": "K Latchamsetty"}, {"name": "R Armstrong"}, {"name": "W Alguire"}, {"name": "C Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": " b2", "index": 2, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Impact of QRS duration of frequent premature ventricular complexes on 
the developmen t of cardiomyopathy", "url": null, "volume": "9"}, {"authors": [{"name": "P Carballeira"}, {"name": "M Deyell"}, {"name": "D Frankel"}, {"name": "D Benhayon"}, {"name": "F Squara"}, {"name": "W Chik"}, {"name": "M Kohari"}, {"name": "R Deo"}, {"name": "F Marchlinski"}], "date": "2014", "id": "b3", "index": 3, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Ventricular premature depolarization QRS duration as a new marker of risk for the development of ventricular premature depolarization- induced cardiomyopathy", "url": null, "volume": "11"}, {"authors": [{"name": "E Aliot"}, {"name": "W Stevenson"}, {"name": "J Almendral-Garrote"}, {"name": "F Bogun"}, {"name": "C Calkins"}, {"name": "E Delacretaz"}, {"name": "B Della"}, {"name": "G Hindricks"}, {"name": "P Jais"}, {"name": "M Josephson"}, {"name": "J Kautzner"}, {"name": "G Kay"}, {"name": "K Kuck"}, {"name": "B Lerman"}, {"name": "F Marchlinski"}, {"name": "V Reddy"}, {"name": "M Schalij"}, {"name": "R Schilling"}, {"name": "K Soejima"}, {"name": "Wilber Ehra/"}], "date": false, "id": "b4", "index": 4, "issue": null, "journal": null, "publisher": null, "title": "HRS Expert Consensus on Catheter Ablation of Ventricular Arrhythmias: developed in a partnership with the European Heart Rhythm Association (EHRA), a Registered Branch of the European Society of Cardiology (ESC), and the Heart Rhythm Society", "url": null, "volume": null}, {"authors": [{"name": "B Ts"}, {"name": "S"}, {"name": "S Ilg"}, {"name": "K Gupta"}, {"name": "S Liu"}, {"name": "T T Ty Y Alguire"}, {"name": "C"}, {"name": "Ar Ar Rms M Tron On Ong"}, {"name": "W Good"}, {"name": "E Chugh"}, {"name": "A A"}, {"name": "A"}, {"name": "J J Jongnaran N Ngs G s Gsin In In K K K"}, {"name": "Pe Pe Pelo Lo Losi Si Si"}, {"name": "F"}, {"name": ","}, {"name": "Jr"}], "date": false, "id": "b5", "index": 5, "issue": null, "journal": null, "publisher": null, "title": "Cra ra rawf wf wfor o o d d d T, T, T, Ebi i in ng ger 
r M M M", "url": null, "volume": null}, {"authors": [{"name": "M"}, {"name": "M"}, {"name": "M Kim M M H H Hm"}, {"name": "M Goo Oo Od"}, {"name": "E Chugh G G A A A, Pe Pe Pelo Lo Losi S"}, {"name": "F Jr R R. ; W"}, {"name": "Crawford T ; Mo Mo Mora Ra Rady Dy Dy F F F"}, {"name": "Bo Bogu Gu Gun N"}, {"name": "F"}], "date": "2012", "id": "b6", "index": 6, "issue": null, "journal": null, "publisher": null, "title": "Rel el elat at atio io ion n n of of of s s sym ym ympt pt ptom om oms s s an an and d d sy sy symp mp mpto to tom m m du du dur r ration o o o p p pre re rema ma matu tu ture re re v v ve e ent nt ntri ri ricu cu cula la ar r r co co comp mp mple le ex x x-i ind nd duc uc uced", "url": null, "volume": "20"}, {"authors": [{"name": "Cardiology Col lege Of"}], "date": "2009", "id": "b7", "index": 7, "issue": null, "journal": null, "publisher": null, "title": "ACC) and the American Heart Association (AHA). Heart Rhythm ", "url": null, "volume": "6"}, {"authors": [{"name": "D Zipes"}, {"name": "A Camm"}, {"name": "M Borggrefe"}, {"name": "A Buxton"}, {"name": "B Chaitman"}, {"name": "M Fro mer"}, {"name": "G Gregoratos"}, {"name": "G Klein"}, {"name": "A Moss"}, {"name": "R Myerburg"}, {"name": "S Priori"}, {"name": "M Quinones"}, {"name": "D Roden"}, {"name" : "M Silka"}, {"name": "C Tracy"}, {"name": "S Smith"}, {"name": "Jr Jacobs"}, {"name": "A Adams"}, {"name": "C Antman"}, {"name": "E Anderson"}, {"name": "J Hunt"}, {"name": "S Halperin"}, {"name": "J Nishimura"}, {"name": "R Ornato"}, {"name": "J Page"}, {"name": "R Riegel"}, {"name": "B Priori"}, {"name": "S Blanc"}, {"name": "J Budaj"}, { "name": "A Camm"}, {"name": "A Dean"}, {"name": "V Deckers"}, {"name": "J Despres"}, {"name": "C Dickstein"}, {"name": "K Lekakis"}, {"name": "J Mcgregor"}, {"name": "K Met ra"}, {"name": "M Morais"}, {"name": "J Osterspey"}, {"name": "A Tamargo"}, {"name": "J Zamorano"}, {"name": "J"}], "date": "2006", "id": "b8", "index": 8, "issue": null, " journal": "J Am 
Coll Cardiol", "publisher": null, "title": "ACC/AHA/ESC 2006 guidelines for management of patients with ventricular arrhythmias and the prevention of sudden cardiac death: a report of the American College of Cardiology/American Heart Association Task Force and the European Society of Cardiology Committee for Practice Guideline s (Writing Committee to Develop Guidelines for Management of Patients With Ventricular Arrhythmias and the Prevention of Sudden Cardiac Death)", "url": null, "volume": "48" }, {"authors": [{"name": "Y Sekiguchi"}, {"name": "K Aonuma"}, {"name": "Y Yamauchi"}, {"name": "T Obayashi"}, {"name": "A Niwa"}, {"name": "H Hachiya"}, {"name": "A Takaha shi"}, {"name": "J Nitta"}, {"name": "Y Iesaka"}, {"name": "M Isobe"}], "date": "2005", "id": "b9", "index": 9, "issue": null, "journal": "J Cardiovasc Electrophysiol", "pu blisher": null, "title": "Chronic hemodynamic effects after radiofrequency catheter ablation of frequent monomorphic ventricular premature beats", "url": null, "volume": "1 6"}, {"authors": [{"name": "H Tada"}, {"name": "S Ito"}, {"name": "G Shinbo"}, {"name": "K Tadokoro"}, {"name": "I Ito"}, {"name": "T Hashimoto"}, {"name": "K Miyaji"}, {"name": "K Kaseno"}, {"name": "S Naito"}, {"name": "A Nogami"}, {"name": "S Oshima"}, {"name": "K Taniguchi"}], "date": "2006", "id": "b10", "index": 10, "issue": null, "jour nal": "Pacing Clin Electrophysiol", "publisher": null, "title": "Significance and utility of plasma brain natriuretic peptide concentrations in patients with idiopathic ven tricular arrhythmias", "url": null, "volume": "29"}, {"authors": [{"name": "F Knebel"}, {"name": "I Schimke"}, {"name": "K Pliet"}, {"name": "S Schattke"}, {"name": "S Mart in"}, {"name": "A Borges"}, {"name": "G Baumann"}], "date": "2005", "id": "b11", "index": 11, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "NT-ProBN P in acute heart failure: correlation with invasively measured hemodynamic parameters during recompensation", 
"url": null, "volume": "11"}, {"authors": [{"name": "R Krittay aphong"}, {"name": "T Boonyasirinant"}, {"name": "P Saiviroonporn"}, {"name": "P Thanapiboonpol"}, {"name": "S Nakyen"}, {"name": "S Udompunturak"}], "date": "2008", "id": "b12", "index": 12, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "Correlation Between NT-pro BNP levels and left ventricular wall stress, sphericity index and extent of myocardial damage: a magnetic resonance imaging study", "url": null, "volume": "14"}, {"authors": [{"name": "S Yuda"}, {"name": "V Khoury"}, {"name": " T Marwick"}], "date": "2002", "id": "b13", "index": 13, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Influence of wall stress and left ventri cular geometry on the accuracy of dobutamine stress echocardiography", "url": null, "volume": "40"}, {"authors": [{"name": "L Krupp"}, {"name": "N Larocca"}, {"name": "J Mu ir-Nash"}, {"name": "A Steinberg"}], "date": "1989", "id": "b14", "index": 14, "issue": null, "journal": "Arch Neurol", "publisher": null, "title": "The fatigue severity sc ale. 
Application to patients with multiple sclerosis and systemic lupus erythematosus", "url": null, "volume": "46"}, {"authors": [{"name": "F Gustafsson"}, {"name": "F Ste ensgaard-Hansen"}, {"name": "J Badskjaer"}, {"name": "A Poulsen"}, {"name": "P Corell"}, {"name": "P Hildebrandt"}], "date": false, "id": "b15", "index": 15, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "Tad Da"}, {"name": "H"}, {"name": ", Ito To"}, {"name": "S"}, {"name": ", Sh S Inbo" }, {"name": "G Tadokoro"}, {"name": "K Ito"}, {"name": "I"}, {"name": "Has Shi Himoto"}, {"name": "T Miyaji"}, {"name": "K"}, {"name": "Ka"}, {"name": "K Naito"}, {"name": "S No O Oga Ga Gam Mi Mi A A A, O Os Oshi Hi Im M Ma"}, {"name": "S Taniguchi"}, {"name": "K"}], "date": false, "id": "b16", "index": 16, "issue": null, "journal": null, "p ublisher": null, "title": "onc nc centrations n n i i in n n pa p p ti ti tien en ents ts ts w w with h h id id idio io iopa pa path th thic ic ic v v ven en entr tr tricul lar r r arr rr rhy hy hyth th thmi m m as as as", "url": null, "volume": null}, {"authors": [{"name": "K K Kne Ne Nebe"}, {"name": "F"}, {"name": "F"}, {"name": "F"}, {"name": "S S Sch Ch Chim Im Mke Ke Ke I"}, {"name": ","}, {"name": "Pl Pl P Iet T K"}, {"name": "K"}, {"name": "K"}, {"name": "S S Sch Ch Chattk Tk Ke E S"}, {"name": "S"}, {"name": "S Ti Tin N S"}, {"name": "S"}, {"name": "S B B"}], "date": false, "id": "b17", "index": 17, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "S Vickery"}, {"name": "C Price"}, {"name": "R John"}, {"name": "N Abbas"}, {"name": "M Webb"}, {"name": "M Kempson"}, {"name": "E Lamb"}], "da te": "2005", "id": "b18", "index": 18, "issue": null, "journal": "Am J Kidney Dis", "publisher": null, "title": "B-type natriuretic peptide (BNP) and amino-terminal proBNP in patients with CKD: relationship to renal function and left ventricular hypertrophy", "url": null, 
"volume": "46"}, {"authors": [{"name": "C Van Huls Van Taxis"}, {"name" : "A Wijnmaalen"}, {"name": "D Den Uijl"}, {"name": "M Gawrysiak"}, {"name": "H Putter"}, {"name": "M Schalij"}, {"name": "K Zeppenfeld"}], "date": "2011", "id": "b19", "in dex": 19, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Reversed polarity of bipolar electrograms for predicting a successful ablation site in foca l idiopathic right ventricular outflow tract arrhythmias", "url": null, "volume": "8"}, {"authors": [{"name": "D Penela"}, {"name": "C Van Huls Van Taxis"}, {"name": "L Agu inaga"}, {"name": "J Fernandez-Armenta"}, {"name": "L Mont"}, {"name": "M Castel"}, {"name": "M Heras"}, {"name": "J Tolosana"}, {"name": "M Sitges"}, {"name": "A Ordonez"} , {"name": "J Brugada"}, {"name": "K Zeppenfeld"}, {"name": "A Berruezo"}], "date": "2013", "id": "b20", "index": 20, "issue": null, "journal": "J Am Coll Cardiol", "publis her": null, "title": "Neurohormonal, structural, and functional recovery pattern after premature ventricular complex ablation is independent of structural heart disease sta tus in patients with depressed left ventricular ejection fraction: a prospective multicenter study", "url": null, "volume": "62"}, {"authors": [{"name": "S Niwano"}, {"name": "Y Wakisaka"}, {"name": "H Niwano"}, {"name": "H Fukaya"}, {"name": "S Kurokawa"}, {"name": "M Kiryu"}, {"name": "Y Hatakeyama"}, {"name": "T Izumi"}], "date": "2009", " id": "b21", "index": 21, "issue": null, "journal": "Heart", "publisher": null, "title": "Prognostic significance of frequent premature ventricular contractions originating from the ventricular outflow tract in patients with normal left ventricular function", "url": null, "volume": "95"}, {"authors": [{"name": "L Costello-Boerrigter"}, {"name" : "G Boerrigter"}, {"name": "M Redfield"}, {"name": "R Rodeheffer"}, {"name": "L Urban"}, {"name": "D Mahoney"}, {"name": "S Jacobsen"}, {"name": "D Heublein"}, {"name": "J Burnett"}], "date": 
"2006", "id": "b22", "index": 22, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Amino-terminal pro-B-type natriuretic pep tide and B-type natriuretic peptide in the general community: determinants and detection of left ventricular dysfunction", "url": null, "volume": "47"}, {"authors": [{"name": "L Co Ostello-Boe Err Rrigter"}, {"name": "G Boerrigter"}, {"name": "Redfield"}, {"name": "M Mm"}, {"name": "R Rodeheffer"}, {"name": ", Ur U Ban"}, {"name": "L Mahoney" }, {"name": "Dw W W"}, {"name": ","}, {"name": "Ja Jacobs Bs Bsen En En"}, {"name": "S S Sj J J Heublein"}, {"name": "D Burnett"}, {"name": "J"}], "date": false, "id": "b23 ", "index": 23, "issue": null, "journal": "mu mu munity ty y: : : de de dete te term rm min in inan", "publisher": null, "title": "Am Am Amino-terminal p p pro r r-B-ty ty type natriuretic peptide an an nd d B-type n na at atri ri riur ur", "url": null, "volume": null}], "date": false, "doi": "10.1161/circep.115.003091", "journal": {"eissn": "1941-3084", "issn": "1941-3149", "issue": null, "name": "Circulation: Arrhythmia and Electrophysiology", "publisher": "Ovid Technologies (Wolters Kluwer Health)", "volume" : null}, "title": "Fatigue as Presenting Symptom and a High Burden of Premature Ventricular Contractions Are Independently Associated with Increased Ventricular Wall Stress in Patients with Normal Left Ventricular Function"}
diff --git a/python_hadoop/tests/files/example_ungrobided.tsv b/python_hadoop/tests/files/example_ungrobided.tsv
new file mode 100644
index 0000000..9263b6f
--- /dev/null
+++ b/python_hadoop/tests/files/example_ungrobided.tsv
@@ -0,0 +1,20 @@
+sha1:23LOSW2QVMKUYXPFZBXQHBBNQR45WTMU {"c": 1, "d": "2017-10-27T22:21:13", "f": "PDFS-20171027214658-00155.warc.gz", "o": 984263791, "u": "http://circ.ahajournals.org/content/circulationaha/53/6/965.full.pdf"} application/pdf {"c_size": 1050532, "dt": "20171027222113", "offset": 984263791, "surt": "org,ahajournals,circ)/content/circulationaha/53/6/965.full.pdf", "url": "http://circ.ahajournals.org/content/circulationaha/53/6/965.full.pdf", "warc": "PDFS-20171027125450-crawl815/PDFS-20171027214658-00155.warc.gz"}
+sha1:23M2N262M5TWB7F3BVB6ESD3Q26SMPFA {"c": 1, "d": "2012-09-29T07:05:16", "f": "ARCHIVEIT-219-QUARTERLY-FWGZDI-20120929065657-00119-crawling203.us.archive.org-6680.warc.gz", "o": 83570746, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=37&id=225&artlang=en"} application/pdf {"c_size": 3590, "dt": "20120929070516", "offset": 83570746, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=37&id=225", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=37&id=225&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-FWGZDI-00001/ARCHIVEIT-219-QUARTERLY-FWGZDI-20120929065657-00119-crawling203.us.archive.org-6680.warc.gz"}
+sha1:23MFQLDGP4WJD67BS7ERMYQUF7TGCG5X {"c": 1, "d": "2017-08-25T15:19:28", "f": "MSAG-PDF-CRAWL-2017-08-04-20170825143335512-08107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", "o": 573475485, "u": "http://www.bloodjournal.org/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true"} application/pdf {"c_size": 3411470, "dt": "20170825151928", "offset": 573475485, "surt": "org,bloodjournal)/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true", "url": "http://www.bloodjournal.org/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true", "warc": "MSAG-PDF-CRAWL-2017-08-04-20170825114428485-08102-08111-wbgrp-svc284/MSAG-PDF-CRAWL-2017-08-04-20170825143335512-08107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"}
+sha1:23MG6K5Z3JENYCZ2OGTNOJ7QPYANJRCZ {"c": 1, "d": "2017-10-10T11:29:50", "f": "PDFS-20171010110639-00278.warc.gz", "o": 665222706, "u": "http://circ.ahajournals.org/content/circulationaha/53/5/797.full.pdf?download=true"} application/pdf {"c_size": 1107121, "dt": "20171010112950", "offset": 665222706, "surt": "org,ahajournals,circ)/content/circulationaha/53/5/797.full.pdf?download=true", "url": "http://circ.ahajournals.org/content/circulationaha/53/5/797.full.pdf?download=true", "warc": "PDFS-20171010110639-crawl815/PDFS-20171010110639-00278.warc.gz"}
+sha1:23MN67JQKDWRUXJMXXJ2GX6O43SQIV76 {"c": 1, "d": "2015-06-06T18:17:08", "f": "ARCHIVEIT-219-QUARTERLY-9582-20150606125000169-00071-wbgrp-crawl067.us.archive.org-6440.warc.gz", "o": 603211220, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=95&artlang=en"} application/pdf {"c_size": 3450, "dt": "20150606181708", "offset": 603211220, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=95", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=95&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-9582-00007/ARCHIVEIT-219-QUARTERLY-9582-20150606125000169-00071-wbgrp-crawl067.us.archive.org-6440.warc.gz"}
+sha1:23MQCEOMQS5SMZCXIPNQ4E3ZKOCF6DIM {"c": 1, "d": "2004-01-02T19:45:22", "f": "DU_crawl10.20040102194453.arc.gz", "o": 38062592, "u": "http://www.csupomona.edu:80/~darsadmin/ACTIONITEMS.pdf"} application/pdf {"c_size": 15406, "dt": "20040102194522", "offset": 38062592, "surt": "edu,csupomona)/~darsadmin/actionitems.pdf", "url": "http://www.csupomona.edu:80/~darsadmin/ACTIONITEMS.pdf", "warc": "DU_crawl10.20040102181929-c/DU_crawl10.20040102194453.arc.gz"}
+sha1:23NKO4TW6XCESXMSUOOICI3AXVK6Z5BL {"c": 1, "d": "2015-04-27T11:16:33", "f": "eric.ed.gov-inf-20150409-030712-1648j-00064.warc.gz", "o": 3872820483, "u": "http://files.eric.ed.gov/fulltext/ED088632.pdf"} application/pdf {"c_size": 2223528, "dt": "20150427111633", "offset": 3872820483, "surt": "gov,ed,eric,files)/fulltext/ed088632.pdf", "url": "http://files.eric.ed.gov/fulltext/ED088632.pdf", "warc": "archiveteam_archivebot_go_20150427150006/eric.ed.gov-inf-20150409-030712-1648j-00064.warc.gz"}
+sha1:23NW2EPLXDA6UBIJLQMM2DJ2K3GL3WTB {"c": 1, "d": "2014-08-13T09:04:30", "f": "WIDE-20140813084304-09684.warc.gz", "o": 726289594, "u": "http://research.sdccd.edu/docs/Accreditation/2012%20Surveys/Employee%20-%20Briefing/Mesa%20College%202012%20Employee%20Feedback%20Survey%20Briefing.pdf"} application/pdf {"c_size": 3472527, "dt": "20140813090430", "offset": 726289594, "surt": "edu,sdccd,research)/docs/accreditation/2012%20surveys/employee%20-%20briefing/mesa%20college%202012%20employee%20feedback%20survey%20briefing.pdf", "url": "http://research.sdccd.edu/docs/Accreditation/2012%20Surveys/Employee%20-%20Briefing/Mesa%20College%202012%20Employee%20Feedback%20Survey%20Briefing.pdf", "warc": "WIDE-20140813074743-crawl424/WIDE-20140813084304-09684.warc.gz"}
+sha1:23OQICQ4IBVNHJBWJX5ON3QR26KNMQNT {"c": 1, "d": "2010-06-29T00:35:17", "f": "EDG-20100628234135-01241-ia360918.us.archive.org.warc.gz", "o": 572430160, "u": "http://journalism.arizona.edu/news/rockypt6.pdf"} application/pdf {"c_size": 194706, "dt": "20100629003517", "offset": 572430160, "surt": "edu,arizona,journalism)/news/rockypt6.pdf", "url": "http://journalism.arizona.edu/news/rockypt6.pdf", "warc": "EDG-20100628214935-01235-01243-ia360918-20100629023741-00000/EDG-20100628234135-01241-ia360918.us.archive.org.warc.gz"}
+sha1:23OT2AAYPJ3Z5ZOQXVJJTVTKY6QUPICI {"c": 1, "d": "2007-02-25T17:49:01", "f": "38_0_20070225174831_crawl28.arc.gz", "o": 93868066, "u": "http://www.ece.tufts.edu:80/~hopwood/tampa-proceedings.pdf"} application/pdf {"c_size": 162157, "dt": "20070225174901", "offset": 93868066, "surt": "edu,tufts,ece)/~hopwood/tampa-proceedings.pdf", "url": "http://www.ece.tufts.edu:80/~hopwood/tampa-proceedings.pdf", "warc": "38_0_20070225173722_crawl28-c/38_0_20070225174831_crawl28.arc.gz"}
+sha1:23OUFX3ZYMF53HY4RUONR5PKN4HXN4O3 {"c": 1, "d": "2004-05-26T06:45:34", "f": "DW_crawl10.20040526064432.arc.gz", "o": 67593910, "u": "http://207.36.165.114:80/NewOrleans/Papers/1301466.pdf"} application/pdf {"c_size": 306879, "dt": "20040526064534", "offset": 67593910, "surt": "114,165,36,207)/neworleans/papers/1301466.pdf", "url": "http://207.36.165.114:80/NewOrleans/Papers/1301466.pdf", "warc": "DW_crawl10.20040525230808-c/DW_crawl10.20040526064432.arc.gz"}
+sha1:23PA23UIWCBA3CSTDK2JYX7ZIVOHULFG {"c": 1, "d": "2016-02-05T21:48:33", "f": "NLNZ-NZ-CRAWL-005-20160205211003375-02839-6291~wbgrp-crawl007.us.archive.org~8443.warc.gz", "o": 630386943, "u": "http://homepages.engineering.auckland.ac.nz/~smohan/Outreach/Docs/2013/TTU_REU2013.pdf"} application/pdf {"c_size": 2979614, "dt": "20160205214833", "offset": 630386943, "surt": "nz,ac,auckland,engineering,homepages)/~smohan/outreach/docs/2013/ttu_reu2013.pdf", "url": "http://homepages.engineering.auckland.ac.nz/~smohan/Outreach/Docs/2013/TTU_REU2013.pdf", "warc": "NLNZ-NZ-CRAWL-005-20160205211003375-02839-02848-wbgrp-crawl007/NLNZ-NZ-CRAWL-005-20160205211003375-02839-6291~wbgrp-crawl007.us.archive.org~8443.warc.gz"}
+sha1:23PGC74CTD7P6PCF3MZZZJMPYFXRK3OB {"c": 1, "d": "2005-03-17T15:05:51", "f": "EC_binary1_crawl30.20050317150502.arc.gz", "o": 75675778, "u": "http://www.csupomona.edu:80/%7Eengineering/programs/courses/aro/course_outlines/aro_407.pdf"} application/pdf {"c_size": 4842, "dt": "20050317150551", "offset": 75675778, "surt": "edu,csupomona)/~engineering/programs/courses/aro/course_outlines/aro_407.pdf", "url": "http://www.csupomona.edu:80/%7Eengineering/programs/courses/aro/course_outlines/aro_407.pdf", "warc": "EC_binary1_crawl30.20050317135651-c/EC_binary1_crawl30.20050317150502.arc.gz"}
+sha1:23PKJEQWUJAIQQSLP3GCCC5VDXN4RFCX {"c": 1, "d": "2017-10-10T23:50:37", "f": "WIDE-20171010214240-16560.warc.gz", "o": 962106404, "u": "http://www.nbrb.by/bv/articles/8997.pdf"} application/pdf {"c_size": 273375, "dt": "20171010235037", "offset": 962106404, "surt": "by,nbrb)/bv/articles/8997.pdf", "url": "http://www.nbrb.by/bv/articles/8997.pdf", "warc": "WIDE-20171010202419-crawl424/WIDE-20171010214240-16560.warc.gz"}
+sha1:23PRILJUIQUKHRYQIUYAKSBFPH53FOGT {"c": 1, "d": "2017-07-14T18:51:38", "f": "WIDE-20170714181144-06521.warc.gz", "o": 820382225, "u": "http://carsandracingstuff.com/library/articles/32538.pdf"} application/pdf {"c_size": 125426, "dt": "20170714185138", "offset": 820382225, "surt": "com,carsandracingstuff)/library/articles/32538.pdf", "url": "http://carsandracingstuff.com/library/articles/32538.pdf", "warc": "WIDE-20170714174218-crawl426/WIDE-20170714181144-06521.warc.gz"}
+sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI {"c": 1, "d": "2016-06-09T00:27:36", "f": "WIDE-20160609001810-06993.warc.gz", "o": 287880616, "u": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf"} application/pdf {"c_size": 68262, "dt": "20160609002736", "offset": 287880616, "surt": "eu,case-research)/sites/default/files/publications/18092393_e-brief_dabrowski_monetary_policy_final_0.pdf", "url": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf", "warc": "WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz"}
+sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D {"c": 1, "d": "2016-01-07T03:29:03", "f": "MUSEUM-20160107025230-02354.warc.gz", "o": 413484441, "u": "http://www.portlandoregon.gov/fire/article/363695"} application/pdf {"c_size": 44600, "dt": "20160107032903", "offset": 413484441, "surt": "gov,portlandoregon)/fire/article/363695", "url": "http://www.portlandoregon.gov/fire/article/363695", "warc": "MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz"}
+sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT {"c": 1, "d": "2014-06-07T18:00:56", "f": "ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", "o": 720590380, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en"} application/pdf {"c_size": 3727, "dt": "20140607180056", "offset": 720590380, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=264", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz"}
+sha1:23SMLYPFEGIRV6M37FJ5D364TXQXCSMR {"c": 1, "d": "2011-07-12T22:20:32", "f": "WIDE-20110712221302-03146.warc.gz", "o": 222089710, "u": "http://media.dailyuw.com/papers/_091030_7-14_color_web.pdf"} application/pdf {"c_size": 4654708, "dt": "20110712222032", "offset": 222089710, "surt": "com,dailyuw,media)/papers/_091030_7-14_color_web.pdf", "url": "http://media.dailyuw.com/papers/_091030_7-14_color_web.pdf", "warc": "WIDE-20110712221302-crawl413/WIDE-20110712221302-03146.warc.gz"}
+sha1:23SN4XBPSCRPRIHH5UAV45LFCP3VDV3V {"c": 1, "d": "2010-10-28T09:03:57", "f": "WIDE-20101028084449158-00409-23450~ia360921.us.archive.org~9443.warc.gz", "o": 756726028, "u": "http://cdacnoida.in/ASCNT-2010/Language%20Technology/Paper/Reducing%20Errors%20in%20Translation%20using%20Pre-editor%20for%20Indian%20English%20Sentences.pdf"} application/pdf {"c_size": 98408, "dt": "20101028090357", "offset": 756726028, "surt": "in,cdacnoida)/ascnt-2010/language%20technology/paper/reducing%20errors%20in%20translation%20using%20pre-editor%20for%20indian%20english%20sentences.pdf", "url": "http://cdacnoida.in/ASCNT-2010/Language%20Technology/Paper/Reducing%20Errors%20in%20Translation%20using%20Pre-editor%20for%20Indian%20English%20Sentences.pdf", "warc": "WIDE-20101028063239344-00397-00415-ia360921/WIDE-20101028084449158-00409-23450~ia360921.us.archive.org~9443.warc.gz"}
diff --git a/mapreduce/tests/files/small.json b/python_hadoop/tests/files/small.json
index 208fb49..7c75187 100644
--- a/mapreduce/tests/files/small.json
+++ b/python_hadoop/tests/files/small.json
@@ -1,8 +1,8 @@
{
"title": "Dummy Example File",
"authors": [
- {"name": "Brewster Kahle"},
- {"name": "J Doe"}
+ {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"},
+ {"name": "J Doe", "given_name": "J", "surname": "Doe"}
],
"journal": {
"name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
@@ -15,7 +15,7 @@
"date": "2000",
"doi": null,
"citations": [
- { "authors": [{"name": "A Seaperson"}],
+ { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
"date": "2001",
"id": "b0",
"index": 0,
@@ -39,5 +39,8 @@
"abstract": "Everything you ever wanted to know about nothing",
"body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
"acknowledgement": null,
- "annex": null
+ "annex": null,
+ "fatcat_release": null,
+ "grobid_timestamp": "2018-04-02T00:31+0000",
+ "grobid_version": "0.5.1-SNAPSHOT"
}
diff --git a/mapreduce/tests/files/small.xml b/python_hadoop/tests/files/small.xml
index 78b9ba2..78b9ba2 100644
--- a/mapreduce/tests/files/small.xml
+++ b/python_hadoop/tests/files/small.xml
diff --git a/mapreduce/tests/test_backfill_hbase_from_cdx.py b/python_hadoop/tests/test_backfill_hbase_from_cdx.py
index 070662b..070662b 100644
--- a/mapreduce/tests/test_backfill_hbase_from_cdx.py
+++ b/python_hadoop/tests/test_backfill_hbase_from_cdx.py
diff --git a/mapreduce/tests/test_common.py b/python_hadoop/tests/test_common.py
index 34d50ed..34d50ed 100644
--- a/mapreduce/tests/test_common.py
+++ b/python_hadoop/tests/test_common.py
diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/python_hadoop/tests/test_extraction_cdx_grobid.py
index 1bf2420..471d94a 100644
--- a/mapreduce/tests/test_extraction_cdx_grobid.py
+++ b/python_hadoop/tests/test_extraction_cdx_grobid.py
@@ -8,7 +8,7 @@ import responses
import happybase_mock
import wayback.exception
from unittest import mock
-from extraction_cdx_grobid import MRExtractCdxGrobid, Resource
+from extraction_cdx_grobid import MRExtractCdxGrobid
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
diff --git a/python_hadoop/tests/test_extraction_ungrobided.py b/python_hadoop/tests/test_extraction_ungrobided.py
new file mode 100644
index 0000000..cb46d29
--- /dev/null
+++ b/python_hadoop/tests/test_extraction_ungrobided.py
@@ -0,0 +1,178 @@
+
+import io
+import json
+import mrjob
+import pytest
+import struct
+import responses
+import happybase_mock
+import wayback.exception
+from unittest import mock
+from common import parse_ungrobided_line
+from extraction_ungrobided import MRExtractUnGrobided
+
+
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+OK_UNGROBIDED_LINE = b"\t".join((
+ b"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ",
+ b"""{"c": 1, "d": "2017-07-06T07:54:11", "f": "CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz", "o": 914718776, "u": "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250"}""",
+ b"application/pdf",
+ b"""{"c_size": 501, "dt": "20170706075411", "offset": 914718776, "surt": "org,ibc7)/article/file_down.php?mode=article_print&pid=250", "url": "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250", "warc": "CITESEERX-CRAWL-2017-06-20-20170706074206206-00379-00388-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz"}""",
+))
+
+with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r') as f:
+ REAL_TEI_XML = f.read()
+
+@pytest.fixture
+def job():
+ """
+ Note: this mock only seems to work with job.run_mapper(), not job.run();
+ the latter results in a separate instantiation without the mock?
+ """
+ job = MRExtractUnGrobided(['--no-conf', '-'])
+
+ conn = happybase_mock.Connection()
+ conn.create_table('wbgrp-journal-extract-test',
+ {'file': {}, 'grobid0': {}, 'f': {}})
+ job.hb_table = conn.table('wbgrp-journal-extract-test')
+
+ return job
+
+
+@mock.patch('extraction_ungrobided.MRExtractUnGrobided.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_single_line(mock_fetch, job):
+
+ responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+ body=REAL_TEI_XML, content_type='text/xml')
+
+ raw = io.BytesIO(OK_UNGROBIDED_LINE)
+
+ output = io.BytesIO()
+ job.sandbox(stdin=raw, stdout=output)
+
+ job.run_mapper()
+
+ # for debugging tests
+ #print(output.getvalue().decode('utf-8'))
+ #print(list(job.hb_table.scan()))
+
+ # wayback fetch is called exactly once
+ mock_fetch.assert_called_once_with(
+ "CITESEERX-CRAWL-2017-06-20-20170706074206206-00379-00388-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz",
+ 914718776,
+ 501)
+
+ # GROBID receives exactly one POST request
+ assert len(responses.calls) == 1
+
+ # HBase
+ assert job.hb_table.row(b'1') == {}
+
+ # Saved extraction info
+ row = job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ')
+
+ assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
+ # file:mime should actually not get clobbered by GROBID updater
+ #assert row[b'file:mime'] == b"application/pdf"
+ assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200
+ # TODO: assert row[b'grobid0:quality'] == None
+ status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+ assert type(status) == type(dict())
+ assert row[b'grobid0:tei_xml'].decode('utf-8') == REAL_TEI_XML
+ tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))
+ metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8'))
+ assert tei_json['title'] == metadata['title']
+ assert 'body' in tei_json
+ assert 'body' not in metadata
+
+@mock.patch('extraction_ungrobided.MRExtractUnGrobided.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_lines(mock_fetch, job):
+
+ responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+ body=REAL_TEI_XML, content_type='text/xml')
+
+ raw = io.BytesIO(b"""sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI\t{"c": 1, "d": "2016-06-09T00:27:36", "f": "WIDE-20160609001810-06993.warc.gz", "o": 287880616, "u": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf"}\tapplication/pdf\t{"c_size": 68262, "dt": "20160609002736", "offset": 287880616, "surt": "eu,case-research)/sites/default/files/publications/18092393_e-brief_dabrowski_monetary_policy_final_0.pdf", "url": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf", "warc": "WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz"}
+sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D\t{"c": 1, "d": "2016-01-07T03:29:03", "f": "MUSEUM-20160107025230-02354.warc.gz", "o": 413484441, "u": "http://www.portlandoregon.gov/fire/article/363695"}\tapplication/pdf\t{"c_size": 44600, "dt": "20160107032903", "offset": 413484441, "surt": "gov,portlandoregon)/fire/article/363695", "url": "http://www.portlandoregon.gov/fire/article/363695", "warc": "MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz"}
+sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT\t{"c": 1, "d": "2014-06-07T18:00:56", "f": "ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", "o": 720590380, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en"}\tapplication/pdf\t{"c_size": 3727, "dt": "20140607180056", "offset": 720590380, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=264", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz"}""")
+
+
+ output = io.BytesIO()
+ job.sandbox(stdin=raw, stdout=output)
+
+ job.run_mapper()
+
+ # for debugging tests
+ #print(output.getvalue().decode('utf-8'))
+ #print(list(job.hb_table.scan()))
+
+ # GROBID receives exactly three POST requests
+ assert len(responses.calls) == 3
+
+ # wayback fetch is called three times, once per input line
+ mock_fetch.assert_has_calls((
+ mock.call("WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz", 287880616, 68262),
+ mock.call("MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz", 413484441, 44600),
+ mock.call("ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", 720590380, 3727),
+ ))
+
+ # Saved extraction info
+ assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+ assert job.hb_table.row(b'sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI') != {}
+ assert job.hb_table.row(b'sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D') != {}
+ assert job.hb_table.row(b'sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT') != {}
+
+ row = job.hb_table.row(b'sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT')
+ assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
+ # file:mime should actually not get clobbered by GROBID updater
+ #assert row[b'file:mime'] == b"application/pdf"
+ assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200
+ status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+ assert type(status) == type(dict())
+ assert row[b'grobid0:tei_xml'].decode('utf-8') == REAL_TEI_XML
+ tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))
+ metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8'))
+ assert tei_json['title'] == metadata['title']
+ assert 'body' in tei_json
+ assert 'body' not in metadata
+
+def test_parse_ungrobided_invalid(job):
+
+ print("space-prefixed line")
+ raw = " com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ info, status = job.parse_ungrobided_line(raw)
+ assert info is None
+ assert status['status'] == "invalid"
+ assert 'prefix' in status['reason']
+
+ print("commented line")
+ raw = "#com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ info, status = job.parse_ungrobided_line(raw)
+ assert info is None
+ assert status['status'] == "invalid"
+ assert 'prefix' in status['reason']
+
+ print("wrong column count")
+ raw = "a b c d e"
+ info, status = job.parse_ungrobided_line(raw)
+ assert info is None
+ assert status['status'] == "invalid"
+ assert 'parse' in status['reason']
+
+ print("CDX line, somehow")
+ raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf - 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ info, status = job.parse_ungrobided_line(raw)
+ assert info is None
+ print(status)
+ assert status['status'] == "invalid"
+ assert 'parse' in status['reason']
+
+def test_parse_ungrobided_valid():
+
+ parsed = parse_ungrobided_line(OK_UNGROBIDED_LINE.decode('utf-8'))
+ assert parsed['key'] == "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"
+ assert parsed['f:c']['u'] == "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250"
+ assert parsed['file:mime'] == "application/pdf"
+ assert parsed['file:cdx']['c_size'] == 501
+ assert parsed['file:cdx']['dt'] == "20170706075411"
diff --git a/python_hadoop/tests/test_grobid2json.py b/python_hadoop/tests/test_grobid2json.py
new file mode 100644
index 0000000..8497b10
--- /dev/null
+++ b/python_hadoop/tests/test_grobid2json.py
@@ -0,0 +1,22 @@
+
+import xml
+import json
+import pytest
+from grobid2json import *
+
+
+def test_small_xml():
+
+ with open('tests/files/small.xml', 'r') as f:
+ tei_xml = f.read()
+ with open('tests/files/small.json', 'r') as f:
+ json_form = json.loads(f.read())
+
+ assert teixml2json(tei_xml) == json_form
+
+def test_invalid_xml():
+
+ with pytest.raises(xml.etree.ElementTree.ParseError):
+ teixml2json("this is not XML")
+ with pytest.raises(ValueError):
+ teixml2json("<xml></xml>")
diff --git a/sandcrawler-rfc.md b/sandcrawler-rfc.md
new file mode 100644
index 0000000..fea6a7c
--- /dev/null
+++ b/sandcrawler-rfc.md
@@ -0,0 +1,180 @@
+
+**Title:** Journal Archiving Pipeline
+
+**Author:** Bryan Newbold <bnewbold@archive.org>
+
+**Date:** March 2018
+
+**Status:** work-in-progress
+
+This is an RFC-style technical proposal for a journal crawling, archiving,
+extracting, resolving, and cataloging pipeline.
+
+Design work funded by a Mellon Foundation grant in 2018.
+
+## Overview
+
+Let's start with data stores first:
+
+- crawled original fulltext (PDF, JATS, HTML) ends up in petabox/global-wayback
+- file-level extracted fulltext and metadata is stored in HBase, with the hash
+ of the original file as the key
+- cleaned metadata is stored in a "catalog" relational (SQL) database (probably
+ PostgreSQL or some hip scalable NewSQL thing compatible with Postgres or
+ MariaDB)
+
+**Resources:** back-of-the-envelope, around 100 TB petabox storage total (for
+100 million PDF files); 10-20 TB HBase table total. Can start small.
+
+
+All "system" (aka, pipeline) state (eg, "what work has been done") is ephemeral
+and is rederived relatively easily (but might be cached for performance).
+
+The overall "top-down", metadata-driven cycle is:
+
+1. Partners and public sources provide metadata (for catalog) and seed lists
+ (for crawlers)
+2. Crawlers pull in fulltext and HTTP/HTML metadata from the public web
+3. Extractors parse raw fulltext files (PDFs) and store structured metadata (in
+ HBase)
+4. Data Mungers match extracted metadata (from HBase) against the catalog, or
+ create new records if none found.
+
+In the "bottom up" cycle, batch jobs run as map/reduce jobs against the
+catalog, HBase, global wayback, and partner metadata datasets to identify
+potential new public or already-archived content to process, and push tasks
+to the crawlers, extractors, and mungers.
+
+## Partner Metadata
+
+Periodic Luigi scripts run on a regular VM to pull in metadata from partners.
+All metadata is saved to either petabox (for public stuff) or HDFS (for
+restricted). Scripts process/munge the data and push directly to the catalog
+(for trusted/authoritative sources like Crossref, ISSN, PubMed, DOAJ); others
+extract seedlists and push to the crawlers.
+
+**Resources:** 1 VM (could be a devbox), with a large attached disk (spinning
+probably ok)
+
+## Crawling
+
+All fulltext content comes in from the public web via crawling, and all crawled
+content ends up in global wayback.
+
+One or more VMs serve as perpetual crawlers, with multiple active ("perpetual")
+Heritrix crawls operating with differing configuration. These could be
+orchestrated (like h3), or just have the crawl jobs cut off and restarted every
+year or so.
+
+In a starter configuration, there would be two crawl queues. One would target
+direct PDF links, landing pages, author homepages, DOI redirects, etc. It would
+process HTML and look for PDF outlinks, but wouldn't crawl recursively.
+
+HBase is used for de-dupe, with records (pointers) stored in WARCs.
+
+A second config would take seeds as entire journal websites, and would crawl
+continuously.
+
+Other components of the system "push" tasks to the crawlers by copying schedule
+files into the crawl action directories.
+
+WARCs would be uploaded into petabox via draintasker as usual, and CDX
+derivation would be left to the derive process. Other processes are notified of
+"new crawl content" being available when they see new unprocessed CDX files in
+items from specific collections. draintasker could be configured to "cut" new
+items every 24 hours at most to ensure this pipeline moves along regularly, or
+we could come up with other hacks to get lower "latency" at this stage.
+
+**Resources:** 1-2 crawler VMs, each with a large attached disk (spinning)
+
+### De-Dupe Efficiency
+
+We would certainly feed CDX info from all bulk journal crawling into HBase
+before any additional large crawling, to get that level of de-dupe.
+
+Whether all GWB PDFs should be de-duped against is a policy question: is
+there something special about the journal-specific crawls that makes it worth
+having second copies? Eg, if we had previously domain crawled and access is
+restricted, we then wouldn't be allowed to provide researcher access to those
+files... on the other hand, we could extract for researchers given that we
+"refound" the content at a new URL?
+
+Only fulltext files (PDFs) would be de-duped against (by content), so we'd be
+recrawling lots of HTML. Presumably this is a fraction of crawl data size; what
+fraction?
+
+Watermarked files would be refreshed repeatedly from the same PDF, and even
+extracted/processed repeatedly (because the hash would be different). This is
+hard to de-dupe/skip, because we would want to catch "content drift" (changes
+in files).
+
+## Extractors
+
+Off-the-shelf PDF extraction software runs on high-CPU VM nodes (probably
+GROBID running on 1-2 data nodes, which have 30+ CPU cores and plenty of RAM
+and network throughput).
+
+A hadoop streaming job (written in python) takes a CDX file as task input. It
+filters for only PDFs, and then checks each line against HBase to see if it has
+already been extracted. If it hasn't, the script downloads directly from
+petabox using the full CDX info (bypassing wayback, which would be a
+bottleneck). It optionally runs any "quick check" scripts to see if the PDF
+should be skipped ("definitely not a scholarly work"), then if it looks Ok
+submits the file over HTTP to the GROBID worker pool for extraction. The
+results are pushed to HBase, and a short status line written to Hadoop. The
+overall Hadoop job has a reduce phase that generates a human-meaningful report
+of job status (eg, number of corrupt files) for monitoring.
+
+A side job as part of extracting can "score" the extracted metadata to flag
+problems with GROBID, to be used as potential training data for improvement.
+
+**Resources:** 1-2 datanode VMs; hadoop cluster time. Needed up-front for
+backlog processing; less CPU needed over time.
+
+## Matchers
+
+The matcher runs as a "scan" HBase map/reduce job over new (unprocessed) HBase
+rows. It pulls just the basic metadata (title, author, identifiers, abstract)
+and calls the catalog API to identify potential match candidates. If no match
+is found, and the metadata "look good" based on some filters (to remove, eg,
+spam), works are inserted into the catalog (eg, for those works that don't have
+globally available identifiers or other metadata; "long tail" and legacy
+content).
+
+**Resources:** Hadoop cluster time
+
+## Catalog
+
+The catalog is a versioned relational database. All scripts interact with an
+API server (instead of connecting directly to the database). It should be
+reliable and low-latency for simple reads, so it can be relied on to provide a
+public-facing API and have public web interfaces built on top. This is in
+contrast to Hadoop, which for the most part could go down with no public-facing
+impact (other than fulltext API queries). The catalog does not contain
+copyrightable material, but it does contain strong (verified) links to fulltext
+content. Policy gets implemented here if necessary.
+
+A global "changelog" (append-only log) is used in the catalog to record every
+change, allowing for easier replication (internal or external, to partners). As
+little as possible is implemented in the catalog itself; instead helper and
+cleanup bots use the API to propose and verify edits, similar to the wikidata
+and git data models.
+
+Public APIs and any front-end services are built on the catalog. Elasticsearch
+(for metadata or fulltext search) could build on top of the catalog.
+
+**Resources:** Unknown, but estimate 1+ TB of SSD storage each on 2 or more
+database machines
+
+## Machine Learning and "Bottom Up"
+
+TBD.
+
+## Logistics
+
+Ansible is used to deploy all components. Luigi is used as a task scheduler for
+batch jobs, with cron to initiate periodic tasks. Errors and actionable
+problems are aggregated in Sentry.
+
+Logging, metrics, and other debugging and monitoring are TBD.
+
diff --git a/scalding/README.md b/scalding/README.md
index 45b62d0..b09e0e8 100644
--- a/scalding/README.md
+++ b/scalding/README.md
@@ -1,12 +1,13 @@
This directory contains Hadoop map/reduce jobs written in Scala (compiled to
-the JVM) using the Scalding framework.
+the JVM) using the Scalding framework. Scalding builds on the Java Cascading
+library, which itself builds on the Java Hadoop libraries.
See the other markdown files in this directory for more background and tips.
## Dependencies
-Locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build tool, and
-might need (exactly) Scala version 2.11.8.
+To develop locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build
+tool, and might need (exactly) Scala version 2.11.8.
On a debian/ubuntu machine:
@@ -15,24 +16,32 @@ On a debian/ubuntu machine:
sudo apt-get update
sudo apt install scala sbt
+It's also helpful to have a local copy of the `hadoop` binary for running
+benchmarks. The `fetch_hadoop.sh` script in the top level directory will fetch
+an appropriate version.
+
## Building and Running
-Run tests:
+You can run `sbt` commands individually:
+ # run all tests
sbt test
-Build a jar and upload to a cluster machine (from which to run in production):
-
+ # build a jar (also runs tests)
sbt assembly
- scp target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar devbox:
-Run on cluster:
+Or you can start a session and run commands within that, which is *much*
+faster:
+
+ sbt -mem 2048
+
+ sbt> test
+ sbt> assembly
+ sbt> testOnly sandcrawler.SomeTestClassName
- devbox$ touch thing.conf
- devbox$ hadoop jar sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
- com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \
- --app.conf.path thing.conf \
- --output hdfs:///user/bnewbold/spyglass_out_test
+On the cluster, you usually use the `please` script to kick off jobs. Be sure
+to build the jars first, or pass `--rebuild` to do it automatically. You need
+`hadoop` on your path for this.
## Troubleshooting
@@ -42,3 +51,4 @@ If your `sbt` task fails with this error:
try restarting `sbt` with more memory (e.g., `sbt -mem 2048`).
+See `scalding-debugging.md` or maybe `../notes/` for more.
diff --git a/scalding/build.sbt b/scalding/build.sbt
index 980418c..01f55ca 100644
--- a/scalding/build.sbt
+++ b/scalding/build.sbt
@@ -20,6 +20,13 @@ lazy val root = (project in file(".")).
scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude))
},
+ (scalastyleSources in Test) := {
+ // all .scala files in "src/test/scala"
+ val scalaSourceFiles = ((scalaSource in Test).value ** "*.scala").get
+ val dirNameToExclude = "/example/"
+ scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude))
+ },
+
name := "sandcrawler",
resolvers += "conjars.org" at "http://conjars.org/repo",
@@ -35,7 +42,7 @@ lazy val root = (project in file(".")).
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % hadoopVersion,
libraryDependencies += "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests",
libraryDependencies += "org.apache.hbase" % "hbase-common" % hbaseVersion,
- libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.11_0.17.2_cdh5.3.1",
+ libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.11_0.17.2_cdh5.3.1-p1",
// cargo-culted from twitter/scalding's build.sbt
// hint via https://stackoverflow.com/questions/23280494/sbt-assembly-error-deduplicate-different-file-contents-found-in-the-following#23280952
@@ -55,4 +62,5 @@ lazy val root = (project in file(".")).
case x => (assemblyMergeStrategy in assembly).value(x)
},
+ testOptions in Test += Tests.Argument("-oF")
)
diff --git a/scalding/scalastyle-config.xml b/scalding/scalastyle-config.xml
index 86d8fca..47d0feb 100644
--- a/scalding/scalastyle-config.xml
+++ b/scalding/scalastyle-config.xml
@@ -35,7 +35,7 @@
<check level="warning" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
<check level="warning" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
<check level="warning" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
- <check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
+ <check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="false">
<parameters>
<parameter name="maxLineLength"><![CDATA[160]]></parameter>
<parameter name="tabSize"><![CDATA[4]]></parameter>
diff --git a/scalding/scalding-debugging.md b/scalding/scalding-debugging.md
index bd9dd36..5a54742 100644
--- a/scalding/scalding-debugging.md
+++ b/scalding/scalding-debugging.md
@@ -83,3 +83,13 @@ Values of type `List[Fields]` are not printed in the expected way:
scala> allFields.length
res0: Int = 2
+
+## SpyGlass Column Selection
+
+Two equivalent ways to specify `columns`/`column_families`:
+
+ List("f", "file"),
+ List(new Fields("c"), new Fields("size", "mimetype")),
+
+ List("f", "file", "file")
+ List(new Fields("c"), new Fields("size"), new Fields("mimetype")),
diff --git a/scalding/src/main/resources/slug-denylist.txt b/scalding/src/main/resources/slug-denylist.txt
new file mode 100644
index 0000000..926dbd5
--- /dev/null
+++ b/scalding/src/main/resources/slug-denylist.txt
@@ -0,0 +1,554 @@
+abbreviations
+abbreviationsandacronyms
+aboutauthors
+abouttheauthor
+abouttheauthors
+aboutthecover
+abouttheeditors
+abreviations
+abstract
+abstractnotsubmittedforonlinepublication
+abstracts
+abstractsofaapaposterandpodiumpresentations
+abstractsofcommunications
+abstractsofthesesfromthescandinaviancountries
+abstractwithdrawn
+acknowledgement
+acknowledgements
+acknowledgementsvii
+acknowledgementtoreferees
+acknowledgementtoreviewers
+acknowledgment
+acknowledgmentofreferees
+acknowledgments
+addendum
+additionalresources
+address
+advertisersindex
+affect
+affiliation
+afterword
+agenda
+agradecimentos
+agradecimientos
+aimsandscope
+analysis
+annexa
+announcement
+announcements
+annualacknowledgementofmanuscriptreviewers
+anotefromtheeditor
+appendices
+appendix
+appendix1
+appendixa
+appendixb
+appointmentsandstaffchanges
+approximation
+apresentacao
+article
+articles
+articlesofsignificantinterestselectedfromthisissuebytheeditors
+associationnews
+ataglance
+atribute
+attention
+authorguidelines
+authorindex
+authorindexforvolume81
+authorreply
+authors
+authorsreply
+authorsresponse
+avantpropos
+award
+awardsappointmentsannouncements
+backcover
+background
+backmatter
+berichtigung
+besprechungen
+bibliografia
+bibliographie
+bibliography
+bigdata
+blankpage
+blood
+boardoftrustees
+booknotes
+booknotices
+bookofabstracts
+bookreview
+bookreviews
+bookreviewsandnotices
+bookreviewssection
+booksreceived
+buchbesprechung
+buchbesprechungen
+bulletin
+calendar
+calendarofevents
+calendarofmeetings
+callforarticles
+callforpapers
+casereport
+casereports
+casestudy
+chairmansopeningremarks
+changes
+chaos
+chapter1
+chapter10
+chapter1introduction
+chapter2
+chapter7
+chapteri
+chapterone
+chapteroneintroduction
+chaptertwo
+chapterx
+citation
+classes
+classified
+classifieds
+closingremarks
+collaborateurs
+comment
+commentaries
+commentary
+commentaryon
+commenton
+comments
+commentto
+committee
+communication
+communications
+communicationstotheeditor
+communiquedepresse
+community
+components
+comptesrendus
+computerscience
+concludingremarks
+conclusion
+conclusions
+conferencereport
+congratulations
+congresscalendar
+conservation
+content
+contents
+context
+continuingeducation
+continuingmedicaleducation
+contributors
+copyright
+copyrightform
+copyrightnotice
+correction
+corrections
+correspondence
+corrigenda
+corrigendum
+councilminutes
+cover
+coverimage
+currentresearch
+curriculumvitae
+danksagung
+dearreaders
+decisionmaking
+dedication
+dedicatoria
+definition
+description
+discussion
+diskussion
+distribution
+documents
+ear
+economics
+editorial
+editorialadvisoryboard
+editorialannouncement
+editorialboard
+editorialcomment
+editorialcomments
+editorialconsultants
+editoriale
+editorialeditorial
+editorialforeword
+editorialinformation
+editorialintroduction
+editorialintroductions
+editorialnote
+editorialnotes
+editorialpreface
+editorials
+editorialsoftwaresurveysection
+editorialstaff
+editorialstatement
+editorinchief
+editors
+editorschoice
+editorscomment
+editorscomments
+editorscorner
+editorscorrespondence
+editorsforeword
+editorsintroduction
+editorsletter
+editorsnote
+editorsnotes
+editorspage
+editorspicks
+editorspreface
+education
+einfuhrung
+einleitung
+electrophoresis
+employment
+endnotes
+entrevista
+entscheidungsverzeichnis
+epilogue
+equipment
+errata
+erratum
+essay
+essays
+executivesummary
+exercises
+expediente
+extendedabstracts
+feature
+features
+fichatecnica
+figure3
+finalexam
+finalreport
+focus
+foreward
+foreword
+forthcomingarticles
+forthcomingevents
+fortherecord
+forum
+frequentlyaskedquestions
+fromtheeditor
+fromtheeditorinchief
+fromtheeditors
+fromtheeditorsdesk
+fromthepresident
+frontmatter
+furtherreadings
+genealogy
+generaldiscussion
+generalinformation
+generalintroduction
+germany
+gettingstarted
+glosario
+glossary
+glossaryofterms
+guesteditorial
+guesteditorsforeword
+guesteditorsintroduction
+guideforauthors
+guidelinesforcontributors
+health
+heartfailure
+highlights
+highlightsfromthisissue
+highlightsofthisissue
+history
+home
+homework
+hypothesis
+iii
+imageofthemonth
+importantnotice
+impressum
+inbrief
+index
+indexofauthors
+indexofauthorsandtitles
+indice
+indicegeneral
+informationforauthors
+informationtoauthors
+inhalt
+inhaltsverzeichnis
+inleiding
+inmemoriam
+inreply
+inresponse
+insidethisissue
+institutenews
+instructionsforauthors
+instructionstoauthors
+interview
+inthestudy
+inthisissue
+introducao
+introduccion
+introduction
+introductionandoverview
+introductiongenerale
+introductiontotheissue
+introductiontothespecialissue
+introductorycomments
+introductoryremarks
+introduzione
+inventions
+invitedcommentary
+issuesandevents
+jobdescription
+journalclub
+journalscan
+keywords
+kurzkommentiert
+languageteaching
+lecture
+letter
+letterfromtheeditor
+letterfromtheeditorinchief
+letterfromtheeditors
+letterfromthepresident
+letters
+letterstotheeditor
+letterstotheeditors
+lettertotheeditor
+lettertotheeditors
+liminaire
+linearalgebra
+linearregression
+links
+listedestableaux
+listofabbreviations
+listofcontributors
+listoffigures
+listofparticipants
+listofpublications
+listofreferees
+listofreviewers
+listoftables
+literacy
+literatur
+literature
+literaturecited
+literaturereview
+literaturrundschau
+literaturverzeichnis
+litteraturverzeichniss
+livresrecus
+lucina
+lungcancer
+magazin
+maintenance
+materials
+materialsafetydatasheet
+materialsandmethods
+medicinalchemistry
+meetingabstracts
+meetingreport
+meetings
+meetingsandconferences
+meetingsofinterest
+membershipapplication
+memoranda
+memorandum
+messagefromgeneralcochairs
+messagefromthechairs
+messagefromtheeditor
+messagefromtheeditorinchief
+messagefromthepresident
+messagefromtheprogramchairs
+messagefromtheprogramcochairs
+metaanalysis
+miscellanea
+miscellaneous
+miscellany
+missionstatement
+motivation
+mrsnews
+name
+newbooks
+newlyelectedmembersofthecollege
+newproducts
+news
+newsandnotes
+newsandreviews
+newsandviews
+newsbriefs
+newsinbrief
+newsnotes
+newsviews
+noii
+note
+notefromtheeditor
+notes
+notesandcomments
+notesandnews
+notesdelecture
+notesforcontributors
+notesoncontributors
+notice
+noticeboard
+notitle
+notitleavailable
+obituaries
+obituary
+online
+openaccess
+openingaddress
+openingremarks
+oralabstracts
+oralpresentations
+organizingcommittee
+originalarticle
+originalarticles
+other
+outline
+overview
+panorama
+papers
+paperstoappearinforthcomingissues
+partone
+personalandmiscellaneous
+perspective
+perspectives
+philosophy
+pictureofthemonth
+place
+pointofview
+positionsavailable
+poster
+posterpresentations
+postscript
+preface
+prefaceandacknowledgements
+prefacetothesecondedition
+preliminarymaterial
+presentacio
+presentacion
+presentation
+presidentialaddress
+presidentsmessage
+presidentsreport
+pressrelease
+print
+printing
+proceedings
+proceedingsofthenationalacademyofsciences
+profile
+programcommittee
+projectmanagement
+prologue
+publication
+publichealth
+publishersnote
+question
+questionsandanswers
+radiology
+readersforum
+recensiones
+recensions
+recentpublications
+redaktorensforord
+referate
+references
+referenciasbibliograficas
+regression
+rehabilitation
+rejoinder
+remerciements
+reply
+replybyauthors
+researchresearchers
+resenas
+resources
+response
+responsetothelettertotheeditor
+results
+resume
+resumen
+resumes
+resumo
+retraction
+review
+reviewarticle
+revieweracknowledgement
+revieweracknowledgement2013
+reviewers
+reviewessay
+reviews
+reviewsanddescriptionsoftablesandbooks
+reviewsofbooks
+rezension
+rezensionen
+safety
+section
+security
+selectedbibliography
+shortcommunication
+shorternotices
+shortnotices
+socialengineering
+sociology
+sommaire
+sommario
+specialreport
+specialsection
+specifications
+spistresci
+subjectindex
+subscriptions
+suggestedreadings
+sumario
+summaries
+summariesofkeyjournalarticles
+summary
+summaryofproceedings
+summer
+sun
+supplementarymaterial
+symposium
+symptom
+synthese
+tabledesmatieres
+tableofcontents
+tableofcontentsandprologue
+technicalreport
+theauthors
+theauthorsreply
+thebasics
+theeditorsdesk
+thefirstauthorreplies
+thelancet
+theoreticalbackground
+thetimes
+theworldbank
+theyearinreview
+thismonthin
+thismonthinthejournal
+timemanagement
+titeleiinhaltsverzeichnis
+title
+titlepage
+titlepagei
+tocorrespondents
+totheeditor
+unitedkingdom
+unitednations
+unitedstates
+upcomingevents
+vorwort
+website
+welcome
+whatshappening
+whatsnew
+workscited
+yourquestionsanswered
+zudiesemheft
+zusammenfassung
diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
new file mode 100644
index 0000000..abf9220
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -0,0 +1,50 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+class BibjsonScorable extends Scorable {
+
+  // Input is a text file of bibjson lines, located by the "bibjson-input" argument.
+  def getSource(args : Args) : Source = TextLine(args("bibjson-input"))
+
+  // Reads every input line and maps it through
+  // BibjsonScorable.bibjsonToMapFeatures (one Option per line).
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    val lines : TypedPipe[String] = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines.map(line => BibjsonScorable.bibjsonToMapFeatures(line))
+  }
+}
+
+object BibjsonScorable {
+  // Converts one bibjson line into a MapFeatures(slug, features-string).
+  // Yields None when Scorable.jsonToMap gives no map, when there is no
+  // "title" key, when the title is null or empty, or when no slug can be
+  // derived from the features.
+  def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = {
+    Scorable.jsonToMap(json) match {
+      case Some(map) if map.contains("title") => {
+        val title = Scorable.getString(map, "title")
+        val doi = Scorable.getString(map, "doi")
+        val sha1 = Scorable.getString(map, "sha")
+        // TODO: year, authors (if available)
+        if (title != null && !title.isEmpty) {
+          val features : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
+          features.toSlug.map { slug => MapFeatures(slug, features.toString) }
+        } else {
+          None
+        }
+      }
+      case _ => None
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
new file mode 100644
index 0000000..bb6413f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -0,0 +1,153 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+class CrossrefScorable extends Scorable with HBasePipeConversions {
+  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
+  // Input is a text file of Crossref JSON lines, located by "crossref-input".
+  def getSource(args : Args) : Source = TextLine(args("crossref-input"))
+
+  // Reads every input line, keeps those accepted by
+  // CrossrefScorable.keepRecord, and maps the survivors through
+  // CrossrefScorable.jsonToMapFeatures (one Option per kept line).
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    val lines : TypedPipe[String] = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines
+      .filter(line => CrossrefScorable.keepRecord(line))
+      .map(line => CrossrefScorable.jsonToMapFeatures(line))
+  }
+}
+
+object CrossrefScorable {
+
+  // Crossref work types eligible for matching; jsonToMapFeatures drops any
+  // record whose "type" is not in this set.
+  val ContentTypeWhitelist: Set[String] = Set(
+    "book",
+    "book-chapter",
+    "dataset",
+    "dissertation",
+    "journal-article",
+    "letter",
+    "monograph",
+    "posted-content",
+    "pre-print",
+    "proceedings-article",
+    "report",
+    "working-paper")
+
+  // Filter predicate: true only if Scorable.jsonToMap yields a map, a title
+  // can be extracted from it (see mapToTitle), and that title is no longer
+  // than Scorable.MaxTitleLength.
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json) match {
+      case None => false
+      case Some(map) => {
+        mapToTitle(map) match {
+          case None => false
+          case Some(title) => title.length <= Scorable.MaxTitleLength
+        }
+      }
+    }
+  }
+
+  // Returns the first "title" entry, with ":" plus the first "subtitle" entry
+  // appended when a non-empty subtitle exists.
+  // Returns None if there is no "title" key, the title list is null or empty,
+  // or the first title is null. Title length is NOT checked here; keepRecord
+  // applies the Scorable.MaxTitleLength limit.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    def getTitle : Option[String] = {
+      if (map contains "title") {
+        val titles = map("title").asInstanceOf[List[String]]
+        // Null check must come first: a JSON null title would otherwise
+        // throw a NullPointerException on isEmpty.
+        if (titles == null || titles.isEmpty) None else Some(titles(0))
+      } else {
+        None
+      }
+    }
+
+    def getSubtitle : Option[String] = {
+      if (map contains "subtitle") {
+        val subtitles = map("subtitle").asInstanceOf[List[String]]
+        // Same ordering as above: test for null before dereferencing.
+        if (subtitles == null || subtitles.isEmpty) {
+          None
+        } else {
+          val sub = subtitles(0)
+          if (sub == null || sub.isEmpty) {
+            None
+          } else {
+            Some(sub)
+          }
+        }
+      } else {
+        None
+      }
+    }
+
+    getTitle match {
+      case None => None
+      case Some(baseTitle) => {
+        if (baseTitle == null) {
+          None
+        } else {
+          getSubtitle match {
+            case None => Some(baseTitle)
+            case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle)))
+          }
+        }
+      }
+    }
+  }
+
+  // Returns the "family" name of every entry in "author" that has one, in
+  // input order. Returns an empty list when there is no "author" key.
+  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+    if (map contains "author") {
+      val objArray = map("author").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]])
+      // TODO(bnewbold): combine given and family names?
+      objArray
+        .filter(e => e contains "family")
+        .map(e => e.get("family").get.asInstanceOf[String])
+    } else {
+      List()
+    }
+  }
+
+  // Returns the year taken from created."date-parts"[0][0], or None when
+  // there is no "created" key.
+  // NOTE: any other unexpected shape (missing "date-parts", empty lists, a
+  // non-numeric first element) is not guarded against and will throw.
+  def mapToYear(map : Map[String, Any]) : Option[Int] = {
+    map.get("created") match {
+      case None => None
+      case Some(created) => {
+        Some(created.asInstanceOf[Map[String,Any]]
+          .get("date-parts")
+          .get
+          .asInstanceOf[List[Any]](0)
+          .asInstanceOf[List[Any]](0)
+          .asInstanceOf[Double]
+          .toInt)
+      }
+    }
+  }
+
+  // Converts one Crossref JSON line into a MapFeatures(slug, features-string).
+  // Yields None when Scorable.jsonToMap gives no map, no title can be
+  // extracted, the DOI is null or empty, no author has a family name, the
+  // content type is not in ContentTypeWhitelist, or no slug can be derived.
+  // The DOI is lower-cased; a missing year is passed as 0; a missing "type"
+  // becomes "MISSING-CONTENT-TYPE", which is not whitelisted and is therefore
+  // rejected.
+  def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+    def makeMapFeatures(title : String, doi : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = {
+      // Null check first so a record without a DOI is dropped rather than
+      // throwing a NullPointerException on isEmpty.
+      if (doi == null || doi.isEmpty || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
+        None
+      } else {
+        val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+        sf.toSlug match {
+          case None => None
+          case Some(slug) => Some(MapFeatures(slug, sf.toString))
+        }
+      }
+    }
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) =>
+        mapToTitle(map) match {
+          case None => None
+          case Some(title) => makeMapFeatures(
+            title=title,
+            doi=Scorable.getString(map, "DOI"),
+            authors=mapToAuthorList(map),
+            year=mapToYear(map).getOrElse(0),
+            contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE"))
+        }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala b/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala
new file mode 100644
index 0000000..b3734f0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpFileMetaJob.scala
@@ -0,0 +1,36 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps all the info needed to insert a file entity in Fatcat. Useful for
+// joining.
+//
+// Arguments read: "hbase-table", "zookeeper-hosts", "output".
+// Output: TSV rows of (row key, file:cdx, file:mime, file:size).
+class DumpFileMetaJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ // Source built by HBaseBuilder.build over the file:cdx, file:mime and
+ // file:size columns, using SourceMode.SCAN_ALL.
+ val metaPipe : TypedPipe[(String, String, String, Long)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("file:cdx", "file:mime", "file:size"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size"))
+ // Drop rows where any of the three requested columns is absent (null).
+ .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null }
+ // Decode raw bytes: key, cdx and mime as strings; size via Bytes.toLong.
+ .map { case (key, cdx, mime, size) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toString(cdx.copyBytes()),
+ Bytes.toString(mime.copyBytes()),
+ Bytes.toLong(size.copyBytes()))
+ };
+
+ // Write the decoded tuples as TSV to the path given by "output".
+ metaPipe.write(TypedTsv[(String,String,String,Long)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala
new file mode 100644
index 0000000..ee2b7c2
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidMetaInsertableJob.scala
@@ -0,0 +1,38 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps the SHA1 key and grobid0:metadata columns, plus file metadata needed
+// to insert into fatcat. Used, eg, as part of long-tail mellon pipeline.
+//
+// Arguments read: "hbase-table", "zookeeper-hosts", "output".
+// Output: TSV rows of (row key, file:cdx, file:mime, file:size,
+// grobid0:metadata).
+class DumpGrobidMetaInsertableJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ // Source built by HBaseBuilder.build over the three file:* columns plus
+ // grobid0:metadata, using SourceMode.SCAN_ALL.
+ val metaPipe : TypedPipe[(String, String, String, Long, String)] = HBaseBuilder.build(args("hbase-table"),
+ args("zookeeper-hosts"),
+ List("file:cdx", "file:mime", "file:size", "grobid0:metadata"),
+ SourceMode.SCAN_ALL)
+ .read
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size", "metadata"))
+ // Drop rows where any of the four requested columns is absent (null).
+ .filter { case (_, cdx, mime, size, metadata) => cdx != null && mime != null && size != null && metadata != null }
+ // Decode raw bytes: size via Bytes.toLong, everything else as strings.
+ .map { case (key, cdx, mime, size, metadata) =>
+ (Bytes.toString(key.copyBytes()),
+ Bytes.toString(cdx.copyBytes()),
+ Bytes.toString(mime.copyBytes()),
+ Bytes.toLong(size.copyBytes()),
+ Bytes.toString(metadata.copyBytes())
+ )
+ };
+
+ // Write the decoded tuples as TSV to the path given by "output".
+ metaPipe.write(TypedTsv[(String,String,String,Long,String)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala
new file mode 100644
index 0000000..42b3464
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidStatusCodeJob.scala
@@ -0,0 +1,34 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Writes a TSV of (SHA1 key, grobid0:status_code) for every row that has a
+// status code. Good for crawl/corpus analytics, if we consider GROBID status
+// a rough "is this a paper" metric.
+//
+// Args: --hbase-table, --zookeeper-hosts, --output
+class DumpGrobidStatusCodeJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val metaPipe : TypedPipe[(String, Long)] =
+    HBaseBuilder.build(
+      args("hbase-table"),
+      args("zookeeper-hosts"),
+      List("grobid0:status_code"),
+      SourceMode.SCAN_ALL)
+      .read
+      .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "status_code"))
+      // Skip rows without a status code; the code itself is decoded as a Long
+      .collect { case (key, status_code) if status_code != null =>
+        (Bytes.toString(key.copyBytes()), Bytes.toLong(status_code.copyBytes()))
+      }
+
+  metaPipe.write(TypedTsv[(String,Long)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
new file mode 100644
index 0000000..953610d
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpGrobidXmlJob.scala
@@ -0,0 +1,41 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+import scala.util.parsing.json.JSONObject
+
+// Dumps GROBID TEI-XML as two-column TSV: the first column is the SHA1 key,
+// the second is a JSON object with the fields "pdf_hash", "cdx_metadata" and
+// "tei_xml". Rows missing file:cdx or grobid0:tei_xml are skipped. Used for
+// partner delivery/sharing.
+//
+// Args: --hbase-table, --zookeeper-hosts, --output
+class DumpGrobidXmlJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val metaPipe : TypedPipe[(String, String)] =
+    HBaseBuilder.build(
+      args("hbase-table"),
+      args("zookeeper-hosts"),
+      List("file:cdx", "grobid0:tei_xml"),
+      SourceMode.SCAN_ALL)
+      .read
+      .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "tei_xml"))
+      .collect { case (key, cdx, tei_xml) if cdx != null && tei_xml != null =>
+        // The key appears both as the TSV column and inside the JSON body
+        val sha1 = Bytes.toString(key.copyBytes())
+        val record = JSONObject(
+          Map(
+            "pdf_hash" -> sha1,
+            "cdx_metadata" -> Bytes.toString(cdx.copyBytes()),
+            "tei_xml" -> Bytes.toString(tei_xml.copyBytes())
+          ))
+        (sha1, record.toString)
+      }
+
+  metaPipe.write(TypedTsv[(String,String)](args("output")))
+
+}
diff --git a/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
new file mode 100644
index 0000000..7fd3ce0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
@@ -0,0 +1,67 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Filters for HBase rows which have not had GROBID run on them, but do have
+// full CDX metadata, and dumps to a TSV for later extraction by the
+// "extraction-ungrobided" job.
+//
+// Does the same horrible join thing that MissingColumnDumpJob does.
+class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val output = args("output")
+
+  // Every scanned row, as (key, f:c, file:mime, file:cdx)
+  val allKeys : TypedPipe[(String,String,String,String)] =
+    DumpUnGrobidedJob
+      .getHBaseKeySource(args("hbase-table"), args("zookeeper-hosts"))
+      .read
+      .fromBytesWritable('key, 'c, 'mime, 'cdx)
+      .toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx)
+
+  // Keys returned by the grobid0:status_code scan. The Boolean is only a
+  // placeholder value so that there is something to join against.
+  val existingKeys : TypedPipe[(String,Boolean)] =
+    DumpUnGrobidedJob
+      .getHBaseColSource(args("hbase-table"), args("zookeeper-hosts"))
+      .read
+      .fromBytesWritable('key)
+      .toTypedPipe[String]('key)
+      .map { rowKey => (rowKey, true) }
+
+  // Left join on key; rows with nothing on the right-hand side are the ones
+  // without a GROBID status code, and only those are kept.
+  val missingKeys : TypedPipe[(String,String,String,String)] = allKeys
+    .groupBy { row => row._1 }
+    .leftJoin(existingKeys.groupBy { flagged => flagged._1 })
+    .toTypedPipe
+    .collect { case (key, ((_, c, mime, cdx), None)) => (key, c, mime, cdx) }
+
+  missingKeys.write(TypedTsv[(String,String,String,String)](output))
+
+}
+
+object DumpUnGrobidedJob {
+
+  // Full-table scan source over the grobid0:status_code column.
+  // Example arguments: "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+  def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+    val columns = List("grobid0:status_code")
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+
+  // Full-table scan source over the f:c, file:mime and file:cdx columns.
+  def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+    val columns = List("f:c", "file:mime", "file:cdx")
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/FatcatScorable.scala b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
new file mode 100644
index 0000000..2090e84
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
@@ -0,0 +1,146 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+
+// Same as FatcatScorable, but reads its lines of JSON from the path given by
+// the "fatcat-release-input-right" argument.
+class FatcatScorableRight extends Scorable {
+
+  def getSource(args : Args) : Source = TextLine(args("fatcat-release-input-right"))
+
+  // One Option[MapFeatures] per kept input line; lines rejected by
+  // FatcatScorable.keepRecord are dropped before feature extraction.
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    val lines : TypedPipe[String] = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines
+      .filter { line => FatcatScorable.keepRecord(line) }
+      .map { line => FatcatScorable.jsonToMapFeatures(line) }
+  }
+}
+
+// Scorable over lines of JSON read from the path given by the
+// "fatcat-release-input" argument.
+class FatcatScorable extends Scorable with HBasePipeConversions {
+
+  def getSource(args : Args) : Source = TextLine(args("fatcat-release-input"))
+
+  // One Option[MapFeatures] per kept input line; lines rejected by
+  // FatcatScorable.keepRecord are dropped before feature extraction.
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    val lines : TypedPipe[String] = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines
+      .filter { line => FatcatScorable.keepRecord(line) }
+      .map { line => FatcatScorable.jsonToMapFeatures(line) }
+  }
+}
+
+object FatcatScorable {
+
+  // Note: removed ReleaseType filtering
+
+  // True when the line parses to a JSON object with a usable title (see
+  // mapToTitle) of at most Scorable.MaxTitleLength characters.
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json)
+      .flatMap(mapToTitle)
+      .exists(_.length <= Scorable.MaxTitleLength)
+  }
+
+  // Returns the title, joined to the subtitle with ":" when a subtitle is
+  // present. Returns None if the title is missing, null, empty, or not a
+  // string. Length is NOT checked here; keepRecord enforces the maximum.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    // Type patterns never match null, so null and non-string values give
+    // None instead of an NPE or ClassCastException.
+    def nonEmptyString(key : String) : Option[String] = map.get(key) match {
+      case Some(value : String) if !value.isEmpty => Some(value)
+      case _ => None
+    }
+
+    nonEmptyString("title").map { title =>
+      nonEmptyString("subtitle") match {
+        case Some(subtitle) => title + ":" + subtitle
+        case None => title
+      }
+    }
+  }
+
+  // Collects "raw_name" from each entry of "contribs". A missing, null or
+  // non-list "contribs" yields an empty list; entries that are not JSON
+  // objects, lack "raw_name", or have a non-string "raw_name" are skipped.
+  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+    // TODO(bnewbold): better name stuff... contrib.surname, creator.surname,
+    // or raw_name split to last
+    map.get("contribs") match {
+      case Some(contribs : List[_]) =>
+        contribs.flatMap {
+          case contrib : Map[_, _] =>
+            contrib.asInstanceOf[Map[String, Any]].get("raw_name") match {
+              case Some(name : String) => Some(name)
+              // A null raw_name is passed through, as it was before;
+              // ScorableFeatures.create replaces null authors with ""
+              case Some(null) => Some(null)
+              case _ => None
+            }
+          case _ => None
+        }
+      case _ => List()
+    }
+  }
+
+  // Returns "release_year" truncated to an Int, or None when the field is
+  // missing, null, or not a number.
+  def mapToYear(map : Map[String, Any]) : Option[Int] = {
+    map.get("release_year") match {
+      case Some(year : Double) => Some(year.toInt)
+      case _ => None
+    }
+  }
+
+  // Parses one line of JSON into MapFeatures. Returns None when the line is
+  // not a JSON object, has no usable title, or the title does not produce a
+  // slug. Scorable.getString asserts that "ident" and "work_id" are present.
+  def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) =>
+        mapToTitle(map) match {
+          case None => None
+          case Some(title) =>
+            // NOTE: not doing any filtering here!
+            val sf : ScorableFeatures = ScorableFeatures.create(
+              title=title,
+              authors=mapToAuthorList(map),
+              // TODO: doi=Scorable.getString(map, "doi"),
+              doi=null,
+              fatcat_release=Scorable.getString(map, "ident"),
+              fatcat_work=Scorable.getString(map, "work_id"),
+              year=mapToYear(map).getOrElse(0))
+            sf.toSlug.map { slug => MapFeatures(slug, sf.toString) }
+        }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
new file mode 100644
index 0000000..f4ed129
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -0,0 +1,83 @@
+package sandcrawler
+
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Scorable over GROBID output stored in HBase: scans grobid0:metadata and
+// grobid0:status_code from the table named by "hbase-table".
+class GrobidScorable extends Scorable with HBasePipeConversions {
+  val StatusOK = 200
+
+  def getSource(args : Args) : Source = {
+    // TODO: Generalize args so there can be multiple grobid pipes in one job
+    GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+  }
+
+  // Keeps rows that have both columns set, a status code equal to StatusOK
+  // and metadata accepted by GrobidScorable.keepRecord, then extracts
+  // features from the metadata JSON.
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    getSource(args)
+      .read
+      // Can't just "fromBytesWritable" because we have multiple types
+      .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code"))
+      .collect { case (key, metadata, status_code) if metadata != null && status_code != null =>
+        (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes()))
+      }
+      .collect { case (key, json, StatusOK) => (key, json) }
+      .filter { case (_, json) => GrobidScorable.keepRecord(json) }
+      .map { case (key, json) => GrobidScorable.jsonToMapFeatures(key, json) }
+  }
+}
+
+object GrobidScorable {
+  // True when the JSON parses, has a "title" key, and that title is non-null
+  // and at most Scorable.MaxTitleLength characters long.
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json) match {
+      case Some(map) if map contains "title" =>
+        val title = Scorable.getString(map, "title")
+        title != null && title.length <= Scorable.MaxTitleLength
+      case _ => false
+    }
+  }
+
+  // Collects the "name" of every entry of "authors" that has one; returns an
+  // empty list when there is no "authors" key.
+  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+    if (!(map contains "authors")) {
+      List()
+    } else {
+      map("authors").asInstanceOf[List[Any]]
+        .map { author => author.asInstanceOf[Map[String,Any]] }
+        .filter { author => author contains "name" }
+        .map { author => author("name").asInstanceOf[String] }
+    }
+  }
+
+  // Full-table scan source over grobid0:metadata and grobid0:status_code.
+  def getHBaseSource(table : String, host : String) : HBaseSource = {
+    val columns = List("grobid0:metadata", "grobid0:status_code")
+    HBaseBuilder.build(table, host, columns, SourceMode.SCAN_ALL)
+  }
+
+  // Builds MapFeatures (title, authors, and the row key as sha1) from the
+  // metadata JSON. Returns None when the JSON does not parse, has no "title"
+  // key, or the title does not produce a slug.
+  def jsonToMapFeatures(key : String, json : String) : Option[MapFeatures] = {
+    Scorable.jsonToMap(json)
+      .filter { map => map contains "title" }
+      .flatMap { map =>
+        val authors : List[String] = mapToAuthorList(map)
+        val title = Scorable.getString(map, "title")
+        ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures
+      }
+  }
+}
+
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
new file mode 100644
index 0000000..3146a6c
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -0,0 +1,59 @@
+
+package sandcrawler
+
+import cascading.flow.FlowDef
+import cascading.pipe.Pipe
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Dumps (slug, features JSON) as TSV for every HBase row whose GROBID status
+// code is 200 and whose metadata yields a slug. Each stage bumps a counter
+// in the "hbase-grobid-dump" group.
+class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
+
+  val grobidHbaseRows = Stat("hbase-rows-scanned", "hbase-grobid-dump")
+  val filteredGrobidRows = Stat("grobid-rows-filtered", "hbase-grobid-dump")
+  val parsedGrobidRows = Stat("grobid-rows-parsed", "hbase-grobid-dump")
+  val validGrobidRows = Stat("grobid-rows-valid-slug", "hbase-grobid-dump")
+
+  val pipe = GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+    .read
+    // Can't just "fromBytesWritable" because we have multiple types?
+    .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code"))
+    .filter { case (_, metadata, status_code) =>
+      // Counted for every scanned row, whether or not it is kept
+      grobidHbaseRows.inc
+      metadata != null && status_code != null
+    }
+    .map { case (key, metadata, status_code) =>
+      (Bytes.toString(key.copyBytes()), Bytes.toString(metadata.copyBytes()), Bytes.toLong(status_code.copyBytes()))
+    }
+    // TODO: Should I combine next two stages for efficiency?
+    .collect { case (key, json, 200) =>
+      filteredGrobidRows.inc
+      (key, json)
+    }
+    .map { case (key, json) =>
+      parsedGrobidRows.inc
+      GrobidScorable.jsonToMapFeatures(key, json)
+    }
+    // Only rows that produced features are counted as valid
+    .collect { case Some(features) =>
+      validGrobidRows.inc
+      features
+    }
+    .groupBy { features => features.slug }
+    .map { tuple =>
+      val (slug : String, features : MapFeatures) = tuple
+      (slug, ReduceFeatures(features.json))
+    }
+
+  pipe
+    .map { case (slug, reduced) => (slug, reduced.json) }
+    .write(TypedTsv[(String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala
new file mode 100644
index 0000000..46d2038
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksJob.scala
@@ -0,0 +1,43 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+// Joins the FatcatScorable input against itself on slug, keeps the pairs
+// accepted by Scorable.selfMatchable, and writes
+// (slug, score, left JSON, right JSON) as TSV.
+class GroupFatcatWorksJob(args: Args) extends JobBase(args) {
+
+  val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler")
+  val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+  val fatcatScorable : Scorable = new FatcatScorable()
+  val fatcatPipe : TypedPipe[(String, ReduceFeatures)] = fatcatScorable
+    .getInputPipe(args)
+    .map { row =>
+      fatcatRowCount.inc
+      row
+    }
+
+  val joinedPipe = fatcatPipe
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(fatcatPipe)
+
+  joinedPipe
+    // filter out trivial self-matches (releases are identical)
+    .filter { case (_, (left, right)) => Scorable.selfMatchable(left, right) }
+    .map { case (slug, (left, right)) =>
+      joinedRowCount.inc
+      // TypedTsv doesn't work over case classes, so emit a plain tuple with
+      // the same fields a ReduceOutput would carry.
+      (slug, Scorable.computeSimilarity(left, right), left.json, right.json)
+    }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala
new file mode 100644
index 0000000..ea5e26b
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/GroupFatcatWorksSubsetJob.scala
@@ -0,0 +1,52 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+// Joins the FatcatScorable input (left) against the FatcatScorableRight
+// input (right) on slug, keeps the pairs accepted by Scorable.selfMatchable,
+// and writes (slug, score, left JSON, right JSON) as TSV.
+class GroupFatcatWorksSubsetJob(args: Args) extends JobBase(args) {
+
+  val fatcatLhsRowCount = Stat("fatcat-rows-filtered-left", "sandcrawler")
+  val fatcatRhsRowCount = Stat("fatcat-rows-filtered-right", "sandcrawler")
+  val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+  val fatcatScorableLhs : Scorable = new FatcatScorable()
+  val fatcatPipeLhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableLhs
+    .getInputPipe(args)
+    .map { row =>
+      fatcatLhsRowCount.inc
+      row
+    }
+
+  val fatcatScorableRhs : Scorable = new FatcatScorableRight()
+  val fatcatPipeRhs : TypedPipe[(String, ReduceFeatures)] = fatcatScorableRhs
+    .getInputPipe(args)
+    .map { row =>
+      fatcatRhsRowCount.inc
+      row
+    }
+
+  val joinedPipe = fatcatPipeLhs
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(fatcatPipeRhs)
+
+  joinedPipe
+    // filter out trivial self-matches (releases are identical)
+    .filter { case (_, (left, right)) => Scorable.selfMatchable(left, right) }
+    .map { case (slug, (left, right)) =>
+      joinedRowCount.inc
+      // TypedTsv doesn't work over case classes, so emit a plain tuple with
+      // the same fields a ReduceOutput would carry.
+      (slug, Scorable.computeSimilarity(left, right), left.json, right.json)
+    }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala
new file mode 100644
index 0000000..20cc7a1
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/HBaseColCountJob.scala
@@ -0,0 +1,37 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Counts the tuples produced by scanning the single column named by
+// "column" and writes that count to "output" as TSV.
+class HBaseColCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val output = args("output")
+
+  HBaseColCountJob
+    .getHBaseSource(args("hbase-table"), args("zookeeper-hosts"), args("column"))
+    .read
+    .debug
+    .groupAll { group => group.size('count) }
+    .write(Tsv(output))
+}
+
+object HBaseColCountJob {
+
+  // Full-table scan source over a single column.
+  // Example table and hosts: "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+  def getHBaseSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = {
+    val columns = List(col)
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 4c3de33..5c7954a 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -30,7 +30,7 @@ object HBaseRowCountJob {
HBaseBuilder.build(
hbaseTable,
zookeeperHosts,
- List("file:size"),
+ List("f:c"),
SourceMode.SCAN_ALL)
}
}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala
new file mode 100644
index 0000000..4d9880f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/HBaseStatusCodeCountJob.scala
@@ -0,0 +1,32 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Writes a TSV histogram of grobid0:status_code values:
+// (status code, number of occurrences).
+class HBaseStatusCodeCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val source = HBaseCountJob.getHBaseSource(
+    args("hbase-table"),
+    args("zookeeper-hosts"),
+    "grobid0:status_code")
+
+  // Only the decoded status code is kept; the row key is discarded
+  val statusPipe : TypedPipe[Long] = source
+    .read
+    .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code)
+    .map { case (_, raw_code) => Bytes.toLong(raw_code.copyBytes()) }
+
+  statusPipe
+    .groupBy { code => code }
+    .size
+    .debug
+    .write(TypedTsv[(Long,Long)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
index fd0b4e2..f79d672 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseStatusCountJob.scala
@@ -18,15 +18,15 @@ class HBaseStatusCountJob(args: Args) extends JobBase(args) with HBasePipeConver
val source = HBaseCountJob.getHBaseSource(
args("hbase-table"),
args("zookeeper-hosts"),
- "grobid0:status_code")
+ "grobid0:status")
- val statusPipe : TypedPipe[Long] = source
+ val statusPipe : TypedPipe[String] = source
.read
- .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status_code)
- .map { case (key, raw_code) => Bytes.toLong(raw_code.copyBytes()) }
+ .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable)]('key, 'status)
+ .map { case (key, raw_status) => Bytes.toString(raw_status.copyBytes()) }
statusPipe.groupBy { identity }
.size
.debug
- .write(TypedTsv[(Long,Long)](args("output")))
+ .write(TypedTsv[(String,Long)](args("output")))
}
diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
new file mode 100644
index 0000000..292de75
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -0,0 +1,30 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+// Joins two bibjson inputs ("left-bibjson" and "right-bibjson") on slug and
+// writes (slug, score, left JSON, right JSON) as TSV.
+class MatchBenchmarkJob(args: Args) extends JobBase(args) {
+  // TODO: Instantiate any subclass of Scorable specified in args.
+  val sc1 : Scorable = new BibjsonScorable()
+  val sc2 : Scorable = new BibjsonScorable()
+  // Each side gets its own copy of the args with "bibjson-input" set to the
+  // matching input path
+  val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson")))
+  val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson")))
+  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
+  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
+
+  pipe1.join(pipe2)
+    .map { case (slug, (features1, features2)) =>
+      // TypedTsv doesn't work over case classes, so emit a plain tuple with
+      // the same fields a ReduceOutput would carry.
+      (slug, Scorable.computeSimilarity(features1, features2), features1.json, features2.json)
+    }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala b/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala
new file mode 100644
index 0000000..cc3bf23
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MissingColumnDumpJob.scala
@@ -0,0 +1,67 @@
+package sandcrawler
+
+import java.util.Properties
+
+import cascading.property.AppProps
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// This nasty, no-good, horrible Job outputs a list of keys ("sha1:A234...")
+// for which the given "column" does not have a value set.
+// It does this using a self-join because SpyGlass's HBase SCAN support seems
+// to be extremely limited.
+class MissingColumnDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+  val output = args("output")
+
+  // Every key returned by the "f:c" scan
+  val allKeys : TypedPipe[String] =
+    MissingColumnDumpJob
+      .getHBaseKeySource(args("hbase-table"), args("zookeeper-hosts"))
+      .read
+      .fromBytesWritable('key)
+      .toTypedPipe[String]('key)
+
+  // Keys returned by the scan of the requested column. The Boolean is only a
+  // placeholder value so that there is something to join against.
+  val existingKeys : TypedPipe[(String,Boolean)] =
+    MissingColumnDumpJob
+      .getHBaseColSource(args("hbase-table"), args("zookeeper-hosts"), args("column"))
+      .read
+      .fromBytesWritable('key)
+      .toTypedPipe[String]('key)
+      .map { rowKey => (rowKey, true) }
+
+  // Left join on key; keys with nothing on the right-hand side are the ones
+  // missing the column.
+  val missingKeys : TypedPipe[String] = allKeys
+    .groupBy { rowKey => rowKey }
+    .leftJoin(existingKeys.groupBy { flagged => flagged._1 })
+    .toTypedPipe
+    .collect { case (key, (_, None)) => key }
+
+  missingKeys.write(TypedTsv[String](output))
+
+}
+
+object MissingColumnDumpJob {
+
+  // Full-table scan source over the single column being checked.
+  // Example table and hosts: "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+  def getHBaseColSource(hbaseTable: String, zookeeperHosts: String, col: String) : HBaseSource = {
+    val columns = List(col)
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+
+  // Full-table scan source over "f:c", used to enumerate the row keys.
+  def getHBaseKeySource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+    val columns = List("f:c")
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
new file mode 100644
index 0000000..d9c38e8
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -0,0 +1,96 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+// Features for one record, keyed by the slug used for grouping and joining.
+case class MapFeatures(slug : String, json : String)
+// Features JSON for one record once the slug has become the group key.
+case class ReduceFeatures(json : String)
+// One scored pair: the shared slug, the score, and both features JSON strings.
+case class ReduceOutput(slug : String, score : Int, json1 : String, json2 : String)
+
+// Base class for inputs that can be turned into (slug, features) pairs.
+// Subclasses say where the data comes from (getSource) and how records
+// become features (getFeaturesPipe).
+abstract class Scorable {
+  // Drops records for which no features could be produced and keys the rest
+  // by slug.
+  def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] = {
+    val validFeatures : TypedPipe[MapFeatures] = getFeaturesPipe(args)
+      .collect { case Some(features) => features }
+
+    validFeatures
+      .groupBy { features => features.slug }
+      .map { case (slug, features) => (slug, ReduceFeatures(features.json)) }
+  }
+
+  // abstract methods
+  def getSource(args : Args) : Source
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]]
+}
+
+object Scorable {
+  // Longest title (in characters) that the keepRecord filters accept
+  val MaxTitleLength = 1023
+
+  // Parses a JSON document. Returns None when the text is not valid JSON or
+  // when the top-level value is not a JSON object (eg, an array or a bare
+  // string), instead of failing with a ClassCastException.
+  def jsonToMap(json : String) : Option[Map[String, Any]] = {
+    // https://stackoverflow.com/a/32717262/631051
+    JSON.parseFull(json) match {
+      case Some(obj : Map[_, _]) => Some(obj.asInstanceOf[Map[String, Any]])
+      case _ => None
+    }
+  }
+
+  // Returns the string stored under key, or None when the map is absent or
+  // the value is missing, null, or not a string. Type patterns never match
+  // null, so callers can use the result without a null check.
+  def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
+    optionalMap.flatMap { map =>
+      map.get(key) match {
+        case Some(value : String) => Some(value)
+        case _ => None
+      }
+    }
+  }
+
+  // Caller is responsible for ensuring that key is a String in map.
+  // TODO: Add and handle ClassCastException
+  def getString(map : Map[String, Any], key : String) : String = {
+    assert(map contains key)
+    map(key).asInstanceOf[String]
+  }
+
+  // Score that computeSimilarity gives to a similarity of 1.0
+  val MaxScore = 1000
+
+  // True when both features carry a "fatcat_release", the two releases
+  // differ, both carry a "fatcat_work", and the first work id sorts strictly
+  // after the second.
+  def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
+
+    val release1 = getStringOption(json1, "fatcat_release")
+    val release2 = getStringOption(json2, "fatcat_release")
+    val releasesDiffer = release1.isDefined && release2.isDefined && release1 != release2
+
+    // this last check ensures we don't double-match
+    def worksOrdered : Boolean =
+      getStringOption(json1, "fatcat_work").exists { work1 =>
+        getStringOption(json2, "fatcat_work").exists { work2 => work1 > work2 }
+      }
+
+    releasesDiffer && worksOrdered
+  }
+
+  // Case-insensitive title similarity scaled to 0..MaxScore; 0 when either
+  // side has no string "title".
+  def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
+    val score = for {
+      title1 <- getStringOption(json1, "title")
+      title2 <- getStringOption(json2, "title")
+    } yield (StringUtilities.similarity(title1.toLowerCase, title2.toLowerCase) * MaxScore).toInt
+    score.getOrElse(0)
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
new file mode 100644
index 0000000..93cd78d
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -0,0 +1,63 @@
+package sandcrawler
+
+import java.io.InputStream
+
+import scala.io.Source
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+object ScorableFeatures {
+  // Classpath resource with one slug per line; slugs listed there are never
+  // returned by toSlug.
+  val fileStream : InputStream = getClass.getResourceAsStream("/slug-denylist.txt")
+  val SlugDenylist : Set[String] = {
+    // getResourceAsStream returns null for a missing resource; fail with a
+    // message that names the file instead of a bare NullPointerException.
+    require(fileStream != null, "classpath resource /slug-denylist.txt not found")
+    try {
+      // toSet reads the whole file, so the stream can be closed right after
+      Source.fromInputStream(fileStream).getLines.toSet
+    } finally {
+      // Close the stream even when reading fails
+      fileStream.close
+    }
+  }
+  // Slugs shorter than this are rejected by toSlug
+  val MinSlugLength = 8
+
+  // Replaces a null string with the empty string
+  private def orEmpty(value : String) : String = if (value == null) "" else value
+
+  // Static factory method. Null strings become "", a null author list
+  // becomes an empty list, and null entries in the list become "".
+  def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = {
+    new ScorableFeatures(
+      title=orEmpty(title),
+      authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
+      year=year,
+      doi=orEmpty(doi),
+      fatcat_release=orEmpty(fatcat_release),
+      fatcat_work=orEmpty(fatcat_work),
+      sha1=orEmpty(sha1))
+  }
+}
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures). The constructor is private; instances
+// come from the ScorableFeatures.create factory method.
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") {
+
+  // All features as a map; the authors list is wrapped in a JSONArray.
+  def toMap() : Map[String, Any] =
+    Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1)
+
+  // The features serialized as a JSON object.
+  override def toString() : String = {
+    JSONObject(toMap).toString
+  }
+
+  // Slug derived from the title: accents and punctuation removed,
+  // lower-cased, whitespace stripped. Returns None when the title is null or
+  // the slug is empty, on the denylist, or shorter than MinSlugLength.
+  def toSlug() : Option[String] = {
+    if (title == null) {
+      None
+    } else {
+      val unaccented = StringUtilities.removeAccents(title)
+      // Remove punctuation
+      val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
+      // No null check here: replaceAll never returns null, and the previous
+      // check sat after slug.isEmpty, so it could never be reached.
+      if (slug.isEmpty
+        || (ScorableFeatures.SlugDenylist contains slug)
+        || (slug.length < ScorableFeatures.MinSlugLength)) {
+        None
+      } else {
+        Some(slug)
+      }
+    }
+  }
+
+  // Slug plus JSON features, or None when no slug can be made.
+  def toMapFeatures : Option[MapFeatures] =
+    toSlug.map { slug => MapFeatures(slug, toString) }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala b/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala
new file mode 100644
index 0000000..58007fa
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreInsertable.scala
@@ -0,0 +1,86 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.pipe.Pipe
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Joins GROBID-derived and Crossref-derived features on slug, scores every
+// matching pair, then joins the scored rows against CDX/mime/size columns read
+// from HBase (keyed by the sha1 found in the GROBID features JSON) and writes
+// the combined rows as TSV to args("output").
+//
+// Required args: "output", "hbase-table", "zookeeper-hosts", plus whatever the
+// two Scorable.getInputPipe implementations read from args.
+class ScoreInsertableJob(args: Args) extends JobBase(args) {
+
+  // Counters, all reported under the "sandcrawler" group.
+  val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler")
+  val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler")
+  val cdxRowCount = Stat("cdx-rows", "sandcrawler")
+  val scoredRowCount = Stat("scored-rows", "sandcrawler")
+  val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+  val grobidScorable : Scorable = new GrobidScorable()
+  val crossrefScorable : Scorable = new CrossrefScorable()
+
+  val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable
+    .getInputPipe(args)
+    .map { r =>
+      grobidRowCount.inc
+      r
+    }
+  val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable
+    .getInputPipe(args)
+    .map { r =>
+      crossrefRowCount.inc
+      r
+    }
+
+  // (row key, cdx, mime, size) for every HBase row that has all three columns.
+  val cdxPipe : TypedPipe[(String, String, String, Long)] = ScoreInsertableJob.getHBaseCdxSource(args("hbase-table"), args("zookeeper-hosts"))
+    .read
+    .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "cdx", "mime", "size"))
+    .filter { case (_, cdx, mime, size) => cdx != null && mime != null && size != null }
+    .map { case (key, cdx, mime, size) =>
+      (Bytes.toString(key.copyBytes()),
+        Bytes.toString(cdx.copyBytes()),
+        Bytes.toString(mime.copyBytes()),
+        Bytes.toLong(size.copyBytes()))
+    }
+    .map { r =>
+      cdxRowCount.inc
+      r
+    }
+
+  val scoredPipe = grobidPipe
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(crossrefPipe)
+    .map { case (slug, (grobidFeatures, crossrefFeatures)) =>
+      scoredRowCount.inc
+      // The sha1 is expected to always be present in the GROBID features JSON.
+      // NOTE(review): if it were ever missing, orNull would yield a null key
+      // for the groupBy/join below -- confirm upstream guarantees this.
+      val key = Scorable.getStringOption(Scorable.jsonToMap(grobidFeatures.json), "sha1").orNull
+      (key, new ReduceOutput(
+        slug,
+        Scorable.computeSimilarity(grobidFeatures, crossrefFeatures),
+        grobidFeatures.json,
+        crossrefFeatures.json))
+    }
+    // TypedTsv doesn't work over case classes, so flatten to a plain tuple.
+    .map { case (key, entry) => (key, entry.slug, entry.score, entry.json1, entry.json2) }
+    .groupBy { case (key, _, _, _, _) => key }
+
+  val joinedPipe = scoredPipe
+    .join(cdxPipe.groupBy { case (key, _, _, _) => key })
+    .map { case (key, ((_, slug, score, left, right), (_, cdx, mime, size))) =>
+      // Previously this counter was declared but never incremented, so
+      // "joined-rows" always reported 0.
+      joinedRowCount.inc
+      (key, slug, score, left, right, cdx, mime, size)
+    }
+    .write(TypedTsv[(String, String, Int, String, String, String, String, Long)](args("output")))
+}
+
+object ScoreInsertableJob {
+
+  // Builds the HBase source used for the CDX join: columns file:cdx,
+  // file:mime and file:size, read with SourceMode.SCAN_ALL.
+  // Example arguments: "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
+  def getHBaseCdxSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
+    val columns = List("file:cdx", "file:mime", "file:size")
+    HBaseBuilder.build(hbaseTable, zookeeperHosts, columns, SourceMode.SCAN_ALL)
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
new file mode 100644
index 0000000..ccb9b76
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -0,0 +1,48 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+// Joins the GROBID and Crossref feature pipes on their String key (bound as
+// `slug` below), scores each joined pair with Scorable.computeSimilarity, and
+// writes (slug, score, grobid json, crossref json) rows as TSV to
+// args("output").
+class ScoreJob(args: Args) extends JobBase(args) {
+
+ // Counters, all reported under the "sandcrawler" group.
+ val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler")
+ val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler")
+ val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+ val grobidScorable : Scorable = new GrobidScorable()
+ val crossrefScorable : Scorable = new CrossrefScorable()
+ // Each input pipe is passed through an identity map whose only purpose is
+ // to bump the matching row counter.
+ val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable
+ .getInputPipe(args)
+ .map { r =>
+ grobidRowCount.inc
+ r
+ }
+ val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable
+ .getInputPipe(args)
+ .map { r =>
+ crossrefRowCount.inc
+ r
+ }
+
+ // A trap sink is attached at <output>.trapped before the join.
+ val joinedPipe = grobidPipe
+ .addTrap(TypedTsv(args("output") + ".trapped"))
+ .join(crossrefPipe)
+
+ // TypedTsv doesn't work over case classes, so each ReduceOutput is
+ // flattened into a plain tuple before writing.
+ joinedPipe
+ .map { case (slug, (grobidFeatures, crossrefFeatures)) =>
+ joinedRowCount.inc
+ //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(grobidFeatures, crossrefFeatures),
+ grobidFeatures.json,
+ crossrefFeatures.json)
+ }
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
new file mode 100644
index 0000000..9150ced
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -0,0 +1,76 @@
+package sandcrawler
+
+import java.text.Normalizer
+import java.util.regex.Pattern
+
+object StringUtilities {
+  // bnewbold: I propose that we:
+  // 1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+  // 2. strip accents
+  // 3. "lower-case" (unicode-aware)
+  // 4. do any final custom/manual mappings
+  //
+  // We should check (test) that null bytes are handled, in addition to other
+  // more obvious characters
+
+  // Letters with a stroke have no canonical decomposition, so NFD
+  // normalization leaves them intact; they are mapped by hand instead.
+  private val AccentReplacements : Map[Char, Char] = Map(
+    '\u0141' -> 'L',
+    '\u0142' -> 'l', // Letter ell
+    '\u00d8' -> 'O',
+    '\u00f8' -> 'o'
+  )
+
+  // Compiled once: java.util.regex.Pattern is immutable and safe to share.
+  private val CombiningMarks : Pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
+
+  // Punctuation to strip: ASCII punctuation (\p{Punct}) plus curly quotes,
+  // middle dot, guillemets, CJK corner brackets, inverted question mark,
+  // en dash, plus-minus, section sign and modifier letter left half ring.
+  private val PunctuationRegex : String = """[\p{Punct}’·“”‘«»「」¿–±§ʿ]"""
+
+  // Strips diacritics from s: decomposes to NFD, replaces the stroke letters
+  // listed in AccentReplacements, then deletes all combining diacritical marks.
+  //
+  // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
+  def removeAccents(s : String) : String = {
+    val decomposed = Normalizer.normalize(s, Normalizer.Form.NFD)
+    val replaced = decomposed.map(c => AccentReplacements.getOrElse(c, c))
+    CombiningMarks.matcher(replaced).replaceAll("")
+  }
+
+  // Removes every character matched by PunctuationRegex; letters (including
+  // accented ones), digits and whitespace are left untouched.
+  //
+  // Source: https://stackoverflow.com/a/30076541/631051
+  def removePunctuation(s: String) : String = {
+    s.replaceAll(PunctuationRegex, "")
+  }
+
+  // Similarity in [0, 1] between two strings after punctuation and accents
+  // have been removed from both: 1 - editDistance / length of the longer
+  // string. Two empty strings are considered identical (1.0).
+  //
+  // Adapted from: https://stackoverflow.com/a/16018452/631051
+  def similarity(s1a : String, s2a : String) : Double = {
+    val (s1, s2) = (removeAccents(removePunctuation(s1a)),
+      removeAccents(removePunctuation(s2a)))
+    val longer : String = if (s1.length > s2.length) s1 else s2
+    val shorter : String = if (s1.length > s2.length) s2 else s1
+    if (longer.length == 0) {
+      // Both strings are empty.
+      1.0
+    } else {
+      (longer.length - stringDistance(longer, shorter)) / longer.length.toDouble
+    }
+  }
+
+  // Levenshtein edit distance between s1 and s2 (insertions, deletions and
+  // substitutions each cost 1), compared per UTF-16 char.
+  //
+  // Computed iteratively with two rows of the distance table. The earlier
+  // memoized recursion went one stack frame deeper per character and keyed
+  // its memo on List[Char] pairs, which risked stack overflow and was slow
+  // on long titles.
+  def stringDistance(s1: String, s2: String): Int = {
+    // previous(j) = distance between the first i-1 chars of s1 and the first
+    // j chars of s2; row 0 is the cost of inserting j chars.
+    var previous : Array[Int] = Array.range(0, s2.length + 1)
+    for (i <- 1 to s1.length) {
+      val current = new Array[Int](s2.length + 1)
+      current(0) = i
+      for (j <- 1 to s2.length) {
+        val substitution = previous(j - 1) + (if (s1(i - 1) == s2(j - 1)) 0 else 1)
+        current(j) = Math.min(Math.min(previous(j) + 1, current(j - 1) + 1), substitution)
+      }
+      previous = current
+    }
+    previous(s2.length)
+  }
+}
+
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
new file mode 100644
index 0000000..8302b8f
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -0,0 +1,172 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class CrossrefScorableTest extends FlatSpec with Matchers {
+ // scalastyle:off
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "<<DOI>>",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "<<TITLE>>" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+""".replace("<<DOI>>", "10.123/aBc")
+ // scalastyle:on
+ val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
+ val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+ val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+ val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+ // Unit tests
+ "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None)
+ }
+
+ it should "handle missing title" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None)
+ }
+
+ it should "handle null title" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None)
+ }
+
+ it should "handle empty title" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None)
+ }
+
+ it should "handle subtitle" in {
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+ }
+ }
+
+ it should "handle empty subtitle" in {
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
+ }
+
+ it should "handle null subtitle" in {
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
+ }
+
+ it should "handle missing authors" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None)
+ }
+
+ it should "handle valid input" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match {
+ case None => fail()
+ case Some(result) => {
+ result.slug shouldBe "sometitle"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
+ }
+ }
+ }
+ }
+ }
+
+ "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+ }
+
+ // Uses the malformed fixture (all closing braces stripped). The previous
+ // version passed CrossrefStringWithoutTitle, which is valid JSON and merely
+ // duplicated the "no title" case above.
+ it should "return false for invalid JSON" in {
+   CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false
+ }
+
+ it should "handle content types" in {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None)
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None)
+ }
+}
diff --git a/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
new file mode 100644
index 0000000..8dda5c8
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
@@ -0,0 +1,72 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
+import scala._
+
+@RunWith(classOf[JUnitRunner])
+class DumpUnGrobidedJobTest extends FunSpec with TupleConversions {
+
+ val output = "/tmp/testOutput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val log = LoggerFactory.getLogger(this.getClass.getName)
+
+ val statusCode: Long = 200
+ val statusBytes = Bytes.toBytes(statusCode)
+
+ val sampleDataGrobid : List[List[Array[Byte]]] = List(
+ ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusBytes),
+ ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusBytes),
+ ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusBytes),
+ ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusBytes),
+ ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusBytes),
+ ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusBytes))
+ .map(pair => List(Bytes.toBytes(pair._1), pair._2))
+
+ val sampleDataFile : List[List[Array[Byte]]] = List(
+ ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+ ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""))
+ .map(pair => List(Bytes.toBytes(pair._1),
+ Bytes.toBytes(pair._2),
+ Bytes.toBytes(pair._3),
+ Bytes.toBytes(pair._4)))
+
+ JobTest("sandcrawler.DumpUnGrobidedJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("debug", "true")
+ .source[Tuple](DumpUnGrobidedJob.getHBaseColSource(testTable, testHost),
+ sampleDataGrobid.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source[Tuple](DumpUnGrobidedJob.getHBaseKeySource(testTable, testHost),
+ sampleDataFile.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .sink[Tuple](TypedTsv[(String,String,String,String)](output)) {
+ outputBuffer =>
+ it("should return correct-length list.") {
+ assert(outputBuffer.size === 2)
+ }
+ }
+ .run
+ .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
new file mode 100644
index 0000000..823e14a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
@@ -0,0 +1,160 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class FatcatScorableTest extends FlatSpec with Matchers {
+ // scalastyle:off
+ val FatcatString =
+"""
+{
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "W Gaier",
+ "surname": "Gaier",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Elsevier BV",
+ "pages": "186-187",
+ "ext_ids": {
+ "doi": "<<DOI>>"
+ },
+ "release_year": 1996,
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "3nccslsn5jez3ixrp5skjyjxu4",
+ "title": "<<TITLE>>",
+ "state": "active",
+ "ident": "pnri57u66ffytigdmyybbmouni",
+ "work_id": "tdmqnfzm2nggrhfwzasyegvpyu",
+ "revision": "e50bd04e-d0d4-4ee7-b7a4-6b4f079de154",
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "0987-7983(96)87729-2"
+ ],
+ "type": "journal-article"
+ }
+ }
+}
+""".replace("<<DOI>>", "10.123/aBc")
+ // scalastyle:on
+ val FatcatStringWithGoodTitle = FatcatString.replace("<<TITLE>>", "Some Title")
+ val FatcatStringWithMaximumTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val FatcatStringWithExcessiveTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val FatcatStringWithNullTitle = FatcatString.replace("\"<<TITLE>>\"", "null")
+ val FatcatStringWithEmptyTitle = FatcatString.replace("<<TITLE>>", "")
+ val FatcatStringWithoutTitle = FatcatString.replace("title", "nottitle")
+ val MalformedFatcatString = FatcatString.replace("}", "")
+ val FatcatStringWithNoAuthors = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("contribs", "no-contribs")
+ //val FatcatStringWrongType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ //val FatcatStringNoType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+ // Unit tests
+ "FatcatScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ FatcatScorable.jsonToMapFeatures(MalformedFatcatString) should be (None)
+ }
+
+ it should "handle missing title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithoutTitle) should be (None)
+ }
+
+ it should "handle null title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithNullTitle) should be (None)
+ }
+
+ it should "handle empty title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithEmptyTitle) should be (None)
+ }
+
+ it should "handle subtitle" in {
+ FatcatScorable.jsonToMapFeatures(
+ """{"title": "short but not too short", "subtitle": "just right!", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article","contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+ }
+ }
+
+ it should "handle empty subtitle" in {
+ FatcatScorable.jsonToMapFeatures(
+ """{"title": "short but not too short", "subtitle": "", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
+ }
+
+ it should "handle null subtitle" in {
+ FatcatScorable.jsonToMapFeatures(
+ """{"title": "short but not too short", "subtitle": null, "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
+ }
+
+ it should "handle missing authors" in {
+ // TODO: not actually removing these
+ //FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should be (None)
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors)
+ }
+
+ it should "handle valid input" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithGoodTitle) match {
+ case None => fail()
+ case Some(result) => {
+ result.slug shouldBe "sometitle"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map("title").asInstanceOf[String] shouldBe "Some Title"
+ //map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ map("fatcat_release").asInstanceOf[String] shouldBe "pnri57u66ffytigdmyybbmouni"
+ map("fatcat_work").asInstanceOf[String] shouldBe "tdmqnfzm2nggrhfwzasyegvpyu"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("W Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 1996
+ }
+ }
+ }
+ }
+ }
+
+ "FatcatScorable.keepRecord()" should "return true for valid JSON with title" in {
+ FatcatScorable.keepRecord(FatcatStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ FatcatScorable.keepRecord(FatcatStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ FatcatScorable.keepRecord(FatcatStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ FatcatScorable.keepRecord(FatcatStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ FatcatScorable.keepRecord(FatcatStringWithoutTitle) shouldBe false
+ }
+
+ // Uses the malformed fixture (all closing braces stripped). The previous
+ // version passed FatcatStringWithoutTitle, which is valid JSON and merely
+ // duplicated the "no title" case above.
+ it should "return false for invalid JSON" in {
+   FatcatScorable.keepRecord(MalformedFatcatString) shouldBe false
+ }
+
+}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
new file mode 100644
index 0000000..bf9343b
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -0,0 +1,124 @@
+
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
+ //scalastyle:off
+ val JsonString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ // scalastyle:on
+ val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+ val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+ val MalformedJsonString = JsonString.replace("}", "")
+
+ // Pipeline tests
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val Sha1Strings : List[String] = List(
+ "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", // good
+ "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", // good
+ "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", // good
+ "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", // bad status
+ "sha1:93187A85273589347598473894839443", // malformed
+ "sha1:024937534094897039547e9824382943") // bad status
+
+ val JsonStrings : List[String] = List(
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+ JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+ JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+ MalformedJsonString,
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
+ )
+
+ // bnewbold: status codes aren't strings, they are uint64
+ val Ok : Long = 200
+ val Bad : Long = 400
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+
+ val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+ .zipped
+ .toList
+ .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+ .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+ // scalastyle:off null
+ // Add example of lines without GROBID data
+ val SampleData = SampleDataHead :+ new Tuple(
+ new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+ // scalastyle:on null
+
+ JobTest("sandcrawler.GrobidScorableDumpJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("debug", "true")
+ .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+ .sink[(String, String)](TypedTsv[(String, String)](output)) {
+ outputBuffer =>
+ "The pipeline" should "return correct-length list" in {
+ outputBuffer should have length 3
+ }
+ }
+ .run
+ .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
new file mode 100644
index 0000000..b395a64
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -0,0 +1,122 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableTest extends FlatSpec with Matchers {
+ val GrobidString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
+ val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+ val MalformedGrobidString = GrobidString.replace("}", "")
+ val Key = "Dummy Key"
+
+ // Unit tests
+
+ "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None)
+ }
+
+ it should "handle null title" in {
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None)
+ }
+
+ it should "handle missing title" in {
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None)
+ }
+
+ it should "handle valid input" in {
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match {
+ case None => fail()
+ case Some(result) => {
+ result.slug shouldBe "dummyexamplefile"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map should contain key "title"
+ map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+ }
+ }
+ }
+ }
+ }
+
+ "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+ GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+ }
+
+ // Uses the malformed fixture (all closing braces stripped). The previous
+ // version passed GrobidStringWithoutTitle, which is valid JSON and merely
+ // duplicated the "no title" case above.
+ it should "return false for invalid JSON" in {
+   GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false
+ }
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
index 603a4c7..c61cb22 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
@@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers {
fields should have length 0
}
+ //scalastyle:off no.whitespace.before.left.bracket
it should "throw IllegalArgumentException on malformed input" in {
a [IllegalArgumentException] should be thrownBy {
HBaseBuilder.parseColSpecs(List("file_size"))
diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
index fde2290..d6d283f 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
@@ -1,15 +1,18 @@
package sandcrawler
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.junit.runner.RunWith
import org.scalatest.FunSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
import scala._
@RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index 3424a36..c4ca5aa 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -1,15 +1,18 @@
package sandcrawler
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.junit.runner.RunWith
import org.scalatest.FunSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
import scala._
/**
@@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
outputBuffer =>
it("should return the test data provided.") {
- println("outputBuffer.size => " + outputBuffer.size)
assert(outputBuffer.size === 1)
}
it("should return the correct count") {
- println("raw output => " + outputBuffer)
assert(outputBuffer(0).getObject(0) === 8)
}
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
new file mode 100644
index 0000000..d2cf9de
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
@@ -0,0 +1,71 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
+import scala._
+
+// Runs sandcrawler.HBaseStatusCodeCountJob through scalding's JobTest: eight in-memory
+// rows stand in for the HBase source, and the rows written to the TSV sink are checked.
+@RunWith(classOf[JUnitRunner])
+class HBaseStatusCodeCountTest extends FunSpec with TupleConversions {
+
+ val output = "/tmp/testOutput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val log = LoggerFactory.getLogger(this.getClass.getName)
+
+ // Two status codes, kept as Longs (for the map lookups below) and as encoded bytes (for the fake rows).
+ val statusType1 : Long = 200
+ val statusType2 : Long = 404
+ val statusType1Bytes = Bytes.toBytes(statusType1)
+ val statusType2Bytes = Bytes.toBytes(statusType2)
+
+ // TODO(bnewbold): how to express a null (empty value) in this list?
+ // Each row becomes List(row-key bytes, status-code bytes); 3 rows carry statusType1 and 5 carry statusType2.
+ val sampleData : List[List[Array[Byte]]] = List(
+ ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+ ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+ ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+ ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+ ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+ ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+ ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+ ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+ .map(pair => List(Bytes.toBytes(pair._1), pair._2))
+
+ // Expected counts are derived from the fixture. `==` on Array[Byte] compares references,
+ // which works here only because every row reuses the same statusTypeNBytes instance.
+ val statusType1Count = sampleData.count(lst => lst(1) == statusType1Bytes)
+ val statusType2Count = sampleData.count(lst => lst(1) == statusType2Bytes)
+
+ JobTest("sandcrawler.HBaseStatusCodeCountJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("debug", "true")
+ // Each fixture row is wrapped field-by-field in ImmutableBytesWritable and packed into a cascading Tuple.
+ .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+ sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ // The sink is read back as (Long, Long) pairs; the assertions below treat them as (status code, count).
+ .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+ outputBuffer =>
+ it("should return a correct number of elements.") {
+ assert(outputBuffer.size === 2)
+ }
+
+ // Convert List[Tuple] to Map[Long, Long].
+ val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
+ it("should have the appropriate number of each status type") {
+ assert(counts(statusType1) == statusType1Count)
+ assert(counts(statusType2) == statusType2Count)
+ }
+ }
+ .run
+ .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index d7689cd..7e91af3 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -1,15 +1,19 @@
package sandcrawler
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.junit.runner.RunWith
import org.scalatest.FunSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
import scala._
@RunWith(classOf[JUnitRunner])
@@ -20,21 +24,20 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
val log = LoggerFactory.getLogger(this.getClass.getName)
- val statusType1 : Long = 200
- val statusType2 : Long = 404
- val statusType1Bytes = Bytes.toBytes(statusType1)
- val statusType2Bytes = Bytes.toBytes(statusType2)
-
- val sampleData = List(
- List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), statusType1Bytes),
- List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), statusType1Bytes),
- List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), statusType2Bytes),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), statusType2Bytes),
- List(Bytes.toBytes("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ"), statusType2Bytes),
- List(Bytes.toBytes("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6"), statusType2Bytes),
- List(Bytes.toBytes("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ"), statusType1Bytes),
- List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), statusType2Bytes)
- )
+ val statusType1Bytes = Bytes.toBytes("""{"status": "success"}""")
+ val statusType2Bytes = Bytes.toBytes("""{"status": "partial"}""")
+
+ // TODO(bnewbold): how to express a null (empty value) in this list?
+ val sampleData : List[List[Array[Byte]]] = List(
+ ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+ ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+ ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+ ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+ ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+ ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+ ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+ ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+ .map(pair => List(Bytes.toBytes(pair._1), pair._2))
val statusType1Count = sampleData.count(lst => lst(1) == statusType1Bytes)
val statusType2Count = sampleData.count(lst => lst(1) == statusType2Bytes)
@@ -46,20 +49,13 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
.arg("hbase-table", testTable)
.arg("zookeeper-hosts", testHost)
.arg("debug", "true")
- .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+ .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status"),
sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
- .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+ .sink[Tuple](TypedTsv[(String, Long)](output)) {
outputBuffer =>
it("should return a 2-element list.") {
assert(outputBuffer.size === 2)
}
-
- // Convert List[Tuple] to Map[Long, Long].
- val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
- it("should have the appropriate number of each status type") {
- assert(counts(statusType1) == statusType1Count)
- assert(counts(statusType2) == statusType2Count)
- }
}
.run
.finish
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
new file mode 100644
index 0000000..c847296
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -0,0 +1,64 @@
+package sandcrawler
+
+import java.io.InputStream
+
+import scala.io.Source
+
+import org.scalatest._
+
+// scalastyle:off null
+class ScorableFeaturesTest extends FlatSpec with Matchers {
+ "toMapFeatures()" should "work with gnarly inputs" in {
+ ScorableFeatures.create(title = null).toMapFeatures
+ ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
+ }
+
+ // Helper: build ScorableFeatures from a title alone and return its slug as an Option.
+ private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug
+
+ // The colon is dropped and the text on both sides is kept (lower-cased), as the expected value shows.
+ "toSlug()" should "keep the text on both sides of a colon" in {
+ titleToSlug("HELLO:there") shouldBe Some("hellothere")
+ }
+
+ it should "extract an entire colon-less string" in {
+ titleToSlug("hello THERE") shouldBe Some("hellothere")
+ }
+
+ // toSlug yields an Option, so "no slug" is None rather than a sentinel value.
+ it should "return None if given empty string" in {
+ titleToSlug("") shouldBe (None)
+ }
+
+ it should "return None if given null" in {
+ titleToSlug(null) shouldBe (None)
+ }
+
+ it should "strip punctuation" in {
+ titleToSlug("HELLO!:the:re") shouldBe Some("hellothere")
+ titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")
+ titleToSlug(
+ "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands")
+ titleToSlug(":;\"\'") shouldBe (None)
+ }
+
+ it should "filter stub titles" in {
+ titleToSlug("abstract") shouldBe (None)
+ titleToSlug("title!") shouldBe (None)
+ titleToSlug("a real title which is not on denylist") shouldBe Some("arealtitlewhichisnotondenylist")
+ }
+
+ it should "strip special characters" in {
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“â€â€˜â€™â€œâ€Â«Â»ã€Œã€Â¿â€“±§ʿ") shouldBe (None)
+ // TODO: titleToSlug("©™₨№…") shouldBe (None)
+ // TODO: titleToSlug("πµΣσ") shouldBe (None)
+ }
+
+ it should "remove whitespace" in {
+ titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz")
+ titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi")
+ titleToSlug("\n \t \r ") shouldBe (None)
+ }
+
+ it should "skip very short slugs" in {
+ titleToSlug("short") shouldBe (None)
+ titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")
+ }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
new file mode 100644
index 0000000..2094543
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -0,0 +1,81 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class ScorableTest extends FlatSpec with Matchers {
+ val JsonString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ "jsonToMap()" should "return a map, given a legal JSON string" in {
+ Scorable.jsonToMap(JsonString) should not be (None)
+ }
+
+ it should "return None, given illegal JSON" in {
+ Scorable.jsonToMap("illegal{,json{{") should be (None)
+ }
+
+ // The raw template is passed unchanged, so both sides carry the same "<<TITLE>>" placeholder title.
+ "computeSimilarity()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+ score shouldBe Scorable.MaxScore
+ }
+
+ // Titles that differ only in letter case must still receive the maximum score.
+ it should "be case-insensitive" in {
+ val left = JsonString.replace("<<TITLE>>", "A TITLE UPPER CASE")
+ val right = JsonString.replace("<<TITLE>>", "a title upper case")
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(left), new ReduceFeatures(right))
+ score shouldBe Scorable.MaxScore
+ }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala
new file mode 100644
index 0000000..5393f10
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala
@@ -0,0 +1,262 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class ScoreInsertableJobTest extends FlatSpec with Matchers {
+ //scalastyle:off
+ val JsonString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ // scalastyle:on
+ val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+ val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+ val MalformedJsonString = JsonString.replace("}", "")
+
+ // scalastyle:off
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "<<DOI>>",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "<<TITLE>>" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+ // scalastyle:on
+ val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string
+ val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
+ val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+ val CrossrefStrings = List(
+ CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
+ CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
+ CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+ CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
+
+ // Pipeline tests
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val Sha1Strings : List[String] = List(
+ "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",
+ "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",
+ "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
+ "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
+ "sha1:93187A85273589347598473894839443",
+ "sha1:024937534094897039547e9824382943",
+ "sha1:93229759932857982837892347893892",
+ "sha1:83229759932857982837892347893892")
+
+ val JsonStrings : List[String] = List(
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+ JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+ JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+ MalformedJsonString,
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+ // These are in both sources but have bad titles
+ JsonString.replace("<<TITLE>>", TooLongOfTitle),
+ JsonString.replace("<<TITLE>>", TooShortOfTitle)
+ )
+
+ // bnewbold: status codes aren't strings, they are uint64
+ val Ok : Long = 200
+ val Bad : Long = 400
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
+
+ val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+ .zipped
+ .toList
+ .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+ .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+ // scalastyle:off null
+ // Add example of lines without GROBID data
+ // scalastyle:off null
+ val SampleData = SampleDataHead :+ new Tuple(
+ new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+ // scalastyle:on null
+
+ val CdxList: List[String] = List("{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}" )
+ val MimeList: List[String] = List("application/pdf", "application/pdf", "application/pdf",
+ "application/pdf", "application/pdf", "application/pdf", "application/pdf",
+ "application/pdf")
+ val SizeList: List[Long] = List(1,2,3,4,5,6,7,8)
+
+ // Can zip 3 lists, but not 4... so we recursively zip
+ val SampleCdxData : List[Tuple] = ((Sha1Strings, CdxList).zipped.toList, (MimeList, SizeList).zipped.toList)
+ .zipped
+ .toList
+ .map { case ((sha: String, cdx: String), (mime: String, size: Long)) => List(Bytes.toBytes(sha), Bytes.toBytes(cdx), Bytes.toBytes(mime), Bytes.toBytes(size)) }
+ .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+ // End-to-end run of sandcrawler.ScoreInsertableJob against in-memory sources:
+ // GROBID rows, CDX rows and Crossref JSON lines. Only the main TSV sink is inspected.
+ JobTest("sandcrawler.ScoreInsertableJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("crossref-input", input)
+ .arg("debug", "true")
+ .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+ .source[Tuple](ScoreInsertableJob.getHBaseCdxSource(testTable, testHost), SampleCdxData)
+ // TextLine test input is a list of (offset, line) pairs; every record gets its own offset
+ // (the last entry previously reused offset 4).
+ .source(TextLine(input), List(
+ 0 -> CrossrefStrings(0),
+ 1 -> CrossrefStrings(1),
+ 2 -> CrossrefStrings(2),
+ 3 -> CrossrefStrings(3),
+ 4 -> CrossrefStrings(4),
+ 5 -> CrossrefStrings(5)))
+ // The ".trapped" sink is registered but its contents are deliberately ignored.
+ .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
+ .sink[(String, String, Int, String, String, String, String, Long)](TypedTsv[(String, String, Int, String, String, String, String, Long)](output)) {
+ // Grobid titles and slugs (in parentheses):
+ // Title 1 (title1)
+ // Title 2: TNG (title2tng)
+ // Title 3: The Sequel (title3thesequel)
+ // <too long of a title>
+ // <too short of a title>
+ // crossref titles and slugs (in parentheses):
+ // Title 2: TNG (title2tng)
+ // Title 1: TNG 2A (title1tng2a)
+ // Title 1: TNG 3 (title1tng3)
+ // Title 2: Rebooted (title2rebooted)
+ // <too long of a title>
+ // <too short of a title>
+ // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
+ outputBuffer =>
+ "The pipeline" should "return a 1-element list" in {
+ outputBuffer should have length 1
+ }
+
+ // In this job's 8-field output rows the slug is the second field.
+ it should "has right # of entries with each slug" in {
+ val slugs = outputBuffer.map(_._2)
+ val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
+ // XXX: countMap("title1") shouldBe 3
+ countMap("title2tng") shouldBe 1
+ }
+
+ // Rebuilds a (slug, score, grobid json, crossref json) tuple from the fixtures at the given indices;
+ // fails the test if either fixture cannot be turned into MapFeatures.
+ def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
+ val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures(
+ Sha1Strings(grobidIndex),
+ JsonStrings(grobidIndex))
+ val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex))
+ if (mfg.isEmpty || mfc.isEmpty) {
+ fail()
+ } else {
+ val score = Scorable.computeSimilarity(
+ ReduceFeatures(mfg.get.json),
+ ReduceFeatures(mfc.get.json))
+ (slug, score, mfg.get.json, mfc.get.json)
+ }
+ }
+
+ // NOTE(review): the Boolean returned by `exists` is discarded, so this case asserts nothing.
+ // It also compares 8-field output rows with the 4-field tuple from bundle(), which can never be equal.
+ // TODO: decide on the expected row and wrap the check in an assertion.
+ it should "have right output values" in {
+ //outputBuffer.exists(_ == bundle("title1", 0, 0))
+ //outputBuffer.exists(_ == bundle("title1", 0, 2))
+ //outputBuffer.exists(_ == bundle("title1", 0, 1))
+ outputBuffer.exists(_ == bundle("title2tng", 1, 3))
+ }
+ }
+ .run
+ .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
new file mode 100644
index 0000000..fbc0ee5
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -0,0 +1,248 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class ScoreJobTest extends FlatSpec with Matchers {
+ //scalastyle:off
+ val JsonString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ // scalastyle:on
+ val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+ val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+ val MalformedJsonString = JsonString.replace("}", "")
+
+ // scalastyle:off
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "<<DOI>>",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "<<TITLE>>" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+ // scalastyle:on
+ val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string
+ val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
+ val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+ val CrossrefStrings = List(
+ CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
+ CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
+ CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+ CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
+
+ // Pipeline tests
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val Sha1Strings : List[String] = List(
+ "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",
+ "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",
+ "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
+ "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
+ "sha1:93187A85273589347598473894839443",
+ "sha1:024937534094897039547e9824382943",
+ "sha1:93229759932857982837892347893892",
+ "sha1:83229759932857982837892347893892")
+
+ val JsonStrings : List[String] = List(
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+ JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+ JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+ MalformedJsonString,
+ // This will have bad status.
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+ // These are in both sources but have bad titles
+ JsonString.replace("<<TITLE>>", TooLongOfTitle),
+ JsonString.replace("<<TITLE>>", TooShortOfTitle)
+ )
+
+ // bnewbold: status codes aren't strings, they are uint64
+ val Ok : Long = 200
+ val Bad : Long = 400
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
+
+ val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+ .zipped
+ .toList
+ .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+ .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+ // scalastyle:off null
+ // Add example of lines without GROBID data
+ // scalastyle:off null
+ val SampleData = SampleDataHead :+ new Tuple(
+ new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+ // scalastyle:on null
+
+ // End-to-end run of sandcrawler.ScoreJob against in-memory sources:
+ // GROBID rows and Crossref JSON lines. Only the main TSV sink is inspected.
+ JobTest("sandcrawler.ScoreJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("crossref-input", input)
+ .arg("debug", "true")
+ .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+ // TextLine test input is a list of (offset, line) pairs; every record gets its own offset
+ // (the last entry previously reused offset 4).
+ .source(TextLine(input), List(
+ 0 -> CrossrefStrings(0),
+ 1 -> CrossrefStrings(1),
+ 2 -> CrossrefStrings(2),
+ 3 -> CrossrefStrings(3),
+ 4 -> CrossrefStrings(4),
+ 5 -> CrossrefStrings(5)))
+ // The ".trapped" sink is registered but its contents are deliberately ignored.
+ .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
+ .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
+ // Grobid titles and slugs (in parentheses):
+ // Title 1 (title1)
+ // Title 2: TNG (title2tng)
+ // Title 3: The Sequel (title3thesequel)
+ // <too long of a title>
+ // <too short of a title>
+ // crossref titles and slugs (in parentheses):
+ // Title 2: TNG (title2tng)
+ // Title 1: TNG 2A (title1tng2a)
+ // Title 1: TNG 3 (title1tng3)
+ // Title 2: Rebooted (title2rebooted)
+ // <too long of a title>
+ // <too short of a title>
+ // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
+ outputBuffer =>
+ "The pipeline" should "return a 1-element list" in {
+ outputBuffer should have length 1
+ }
+
+ // In this job's 4-field output rows the slug is the first field.
+ it should "has right # of entries with each slug" in {
+ val slugs = outputBuffer.map(_._1)
+ val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
+ // XXX: countMap("title1") shouldBe 3
+ countMap("title2tng") shouldBe 1
+ }
+
+ // Rebuilds a (slug, score, grobid json, crossref json) tuple from the fixtures at the given indices;
+ // fails the test if either fixture cannot be turned into MapFeatures.
+ def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
+ val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures(
+ Sha1Strings(grobidIndex),
+ JsonStrings(grobidIndex))
+ val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex))
+ if (mfg.isEmpty || mfc.isEmpty) {
+ fail()
+ } else {
+ val score = Scorable.computeSimilarity(
+ ReduceFeatures(mfg.get.json),
+ ReduceFeatures(mfc.get.json))
+ (slug, score, mfg.get.json, mfc.get.json)
+ }
+ }
+
+ // NOTE(review): the Boolean returned by `exists` is discarded, so this case asserts nothing.
+ // Crossref index 3 is the "Title 2: Rebooted" record, not "Title 2: TNG" (index 0) - confirm the intended pair
+ // before wrapping the check in an assertion.
+ it should "have right output values" in {
+ //outputBuffer.exists(_ == bundle("title1", 0, 0))
+ //outputBuffer.exists(_ == bundle("title1", 0, 2))
+ //outputBuffer.exists(_ == bundle("title1", 0, 1))
+ outputBuffer.exists(_ == bundle("title2tng", 1, 3))
+ }
+ }
+ .run
+ .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
new file mode 100644
index 0000000..410819b
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -0,0 +1,85 @@
+package sandcrawler
+
+import org.scalatest._
+
class StringUtilitiesTest extends FlatSpec with Matchers {

  // Checks stringDistance(left, right) against the expected value for each
  // (left, right, expected) triple, in the order given.
  private def expectDistances(cases: (String, String, Int)*): Unit =
    for ((left, right, expected) <- cases) {
      StringUtilities.stringDistance(left, right) shouldBe expected
    }

  "removeAccents()" should "handle the empty string" in {
    val cleaned = StringUtilities.removeAccents("")
    cleaned shouldBe ""
  }

  it should "not change a string with unaccented characters" in {
    val plain = "abc123"
    StringUtilities.removeAccents(plain) shouldBe "abc123"
  }

  it should "remove accents from Ls" in {
    val cleaned = StringUtilities.removeAccents("E\u0141\u0142en")
    cleaned shouldBe "ELlen"
  }

  it should "remove accents from Es without changing case" in {
    val cleaned = StringUtilities.removeAccents("\u00e9")
    cleaned should have length 1
    cleaned shouldBe "e"
  }

  it should "convert the ø in Soren" in {
    // lower-case and upper-case variants, in that order
    for ((input, expected) <- Seq(("Søren", "Soren"), ("SØREN", "SOREN"))) {
      StringUtilities.removeAccents(input) shouldBe expected
    }
  }

  "removePunctuation" should "work on the empty string" in {
    val cleaned = StringUtilities.removePunctuation("")
    cleaned shouldBe ""
  }

  it should "work on non-empty text strings" in {
    val cases = Seq(
      ("Hello, world!", "Hello world"),
      (":-)", ""),
      ("<<---a---b--->", "ab"))
    for ((input, expected) <- cases) {
      StringUtilities.removePunctuation(input) shouldBe expected
    }
  }

  // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
  "stringDistance" should "work on empty strings" in {
    expectDistances(
      ("", "", 0),
      ("a", "", 1),
      ("", "a", 1),
      ("abc", "", 3),
      ("", "abc", 3))
  }

  it should "work on equal strings" in {
    expectDistances(
      ("", "", 0),
      ("a", "a", 0),
      ("abc", "abc", 0))
  }

  it should "work where only inserts are needed" in {
    expectDistances(
      ("", "a", 1),
      ("a", "ab", 1),
      ("b", "ab", 1),
      ("ac", "abc", 1),
      ("abcdefg", "xabxcdxxefxgx", 6))
  }

  it should "work where only deletes are needed" in {
    expectDistances(
      ("a", "", 1),
      ("ab", "a", 1),
      ("ab", "b", 1),
      ("abc", "ac", 1),
      ("xabxcdxxefxgx", "abcdefg", 6))
  }

  it should "work where only substitutions are needed" in {
    expectDistances(
      ("a", "b", 1),
      ("ab", "ac", 1),
      ("ac", "bc", 1),
      ("abc", "axc", 1),
      ("xabxcdxxefxgx", "1ab2cd34ef5g6", 6))
  }

  it should "work where many operations are needed" in {
    expectDistances(
      ("example", "samples", 3),
      ("sturgeon", "urgently", 6),
      ("levenshtein", "frankenstein", 6),
      ("distance", "difference", 5),
      ("java was neat", "scala is great", 7))
  }
}
diff --git a/sql/README.md b/sql/README.md
new file mode 100644
index 0000000..1d53d6d
--- /dev/null
+++ b/sql/README.md
@@ -0,0 +1,160 @@
+
+TL;DR: replace hbase with postgresql tables with REST API (http://postgrest.org)
+
+No primary storage of anything in this table. Everything should be rapidly
+re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc.
+This is a secondary view on all of that.
+
+## Create Database and User
+
+Create system user with your username like:
+
+ sudo su postgres
+ createuser -s bnewbold
+
+Create database using `diesel` tool (see fatcat rust docs for install notes):
+
+ # DANGER: will delete/recreate entire database
+ diesel database reset
+
+In the future it would probably be better to create a real role/password and
+supply these via the `DATABASE_URL` env variable.
+
+## Schema
+
+ schema/database name is 'sandcrawler'
+
+ cdx: include revisits or not?
+ id: int64, PK
+ sha1hex: string, not null, index
+ cdx_sha1hex: string
+ url: string, not null
+ datetime: ISO 8601:1988 (string?), not null
+ mimetype: string
+ warc_path: string (item and filename)
+ warc_offset: i64
+ created: datetime, index (?)
+ ?crawl: string
+ ?domain: string
+
+ file_meta
+ sha1hex, string, PK
+ md5hex: string
+ sha256hex: string
+ size_bytes: i64
+ mime: string (verifying file status; optional for now?)
+
+ fatcat_file
+ sha1hex: string, PK
+ file_ident: string, index?
+ release_ident: ?
+
+ petabox
+ id: int64, PK
+ sha1hex: string, notnull, index
+ item: string, notnull
+ path: string, notnull (TODO: URL encoded? separate sub-archive path?)
+
+ grobid
+ sha1hex: string, PK
+ updated: datetime
+ grobid_version (string)
+ status_code: i32
+ status: string (JSONB?), only if status != 200
+ metadata: JSONB, title, first author, year (not for now?)
+ glutton_fatcat_release: string, index
+
+ shadow
+ sha1hex: string, PK
+ shadow_corpus: string, PK
+ shadow_id: string
+ doi: string
+ pmid: string
+ isbn13: string
+
+Alternatively, to be more like existing system could have "one big table" or
+multiple tables all with same key (sha1b32) and UNIQ. As is, every sha1 pk
+column is 40 bytes of both index and data, or 8+ GByte (combined) for each
+table with 100 million rows. using raw bytes could help, but makes all
+code/queries much trickier.
+
+Should we have "created" or "updated" timestamps on all these tables to enable
+kafka tailing?
+
+TODO:
+- how to indicate CDX sha1 vs. true sha1 mis-match? pretty rare. recrawl and delete row from `gwb_cdx`?
+- only most recent GROBID? or keep multiple versions? here and minio
+
+## Existing Stuff Sizes (estimates)
+
+ 78.5G /user/bnewbold/journal_crawl_cdx
+ 19.7G /user/bnewbold/sandcrawler/output-prod/2018-12-14-1737.00-dumpfilemeta
+ 2.7G file_hashes.tsv
+ 228.5G /user/bnewbold/sandcrawler/output-prod/2018-09-23-0405.30-dumpgrobidmetainsertable
+
+## Use Cases
+
+Core goal here is to mostly kill hbase/hadoop. What jobs are actually used there?
+
+- backfill: load in-scope (fulltext) crawl results from CDX
+ => bulk (many line) inserts
+- rowcount: "how many unique PDFs crawled?"
+ => trivial SQL query
+- status code count: "how much GROBID progress?"
+ => trivial SQL query
+- dumpungrobided: "what files still need to be processed"
+ => SQL join with a "first" on CDX side
+- dumpgrobidxml: "merge CDX/file info with extracted XML, for those that were successful"
+ => SQL dump or rowscan, then minio fetches
+
+This table is generally "single file raw fulltext metadata".
+
+"Enrichment" jobs:
+
+- GROBID
+- glutton (if not GROBID)
+- extra file metadata
+- match newly enriched files to fatcat
+
+What else?
+
+- track additional raw file metadata
+- dump all basic GROBID metadata (title, authors, year) to attempt merge/match
+
+Questions we might want to answer
+
+- total size of PDF corpus (terabytes)
+- unique files hit per domain
+
+## Prototype Plan
+
+- backfill all CDX crawl files (TSV transform?)
+- load full GROBID XML (both into minio and into SQL)
+- load full fatcat file dump (TSV transform)
+- load dumpfilemeta
+
+## Example Useful Lookups
+
+
+ http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES
+ http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278
+
+
+## Full SQL Database Dumps
+
+Run a dump in compressed, postgres custom format:
+
+ export DATESLUG="`date +%Y-%m-%d.%H%M%S`"
+ time sudo -u postgres pg_dump --verbose --format=custom sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump
+
+As of 2021-04-07, this process runs for about 4 hours and the compressed
+snapshot is 88 GBytes (compared with 551.34G database disk consumption).
+
+To restore a dump (which will delete local database content, if any):
+
+ sudo su postgres
+ createuser --no-login web_anon
+ createuser -s sandcrawler
+ time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump
+
+Took about 2.5 hours.
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
new file mode 100644
index 0000000..f1a5f86
--- /dev/null
+++ b/sql/backfill/backfill.md
@@ -0,0 +1,135 @@
+
+SQL Backfill Notes
+-----------------------
+
+GROBID is going to be somewhat complex.
+
+TODO:
+x CDX backfill script (CDX to postgresql direct, bulk inserts, python)
+x `file_meta` bulk insert (TSV to postgresql direct, bulk upserts, python)
+x GROBID insert (python, dump TSV to minio then postgresql)
+
+## `cdx`
+
+ #cat example.cdx | rg ' 200 ' | cut -d' ' -f2,3,4,6,9,10,11
+ #cat example.cdx | rg ' 200 ' | awk '{print $6 "\t" $3 "\t" $2 "\t" $4 "\t\t" $6 "\t" $11 "\t" $9 "\t" $10}' | b32_hex.py | awk '{print $2 "\t" $3 "\t" $4 "\t" $1 "\t" $6 "\t" $7 "\t" $8}' > cdx.example.tsv
+ cat example.cdx | ./filter_transform_cdx.py > cdx.example.tsv
+
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.example.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+
+Big HDFS import:
+
+ # but actually didn't import those; don't want to need to re-import
+ hdfs dfs -get journal_crawl_cdx/*
+
+ cat citeseerx_crawl_2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.citeseerx_crawl_2017.tsv
+ cat gwb-pdf-20171227034923-surt-filter/* | rg ' 200 ' | ./filter_transform_cdx.py > gwb-pdf-20171227034923-surt-filter.tsv
+ cat UNPAYWALL-PDF-CRAWL-2018-07.filtered.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.UNPAYWALL-PDF-CRAWL-2018-07.filtered.tsv
+ cat MSAG-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.MSAG-PDF-CRAWL-2017.tsv
+
+ cat CORE-UPSTREAM-CRAWL-2018-11.sorted.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.CORE-UPSTREAM-CRAWL-2018-11.sorted.tsv
+ cat DIRECT-OA-CRAWL-2019.pdfs.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.DIRECT-OA-CRAWL-2019.pdfs.tsv
+ cat DOI-LANDING-CRAWL-2018-06.200_pdf.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.DOI-LANDING-CRAWL-2018-06.200_pdf.tsv
+ cat OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.tsv
+ cat SEMSCHOLAR-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.SEMSCHOLAR-PDF-CRAWL-2017.tsv
+ cat TARGETED-PDF-CRAWL-2017.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.TARGETED-PDF-CRAWL-2017.tsv
+ cat UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.cdx | rg ' 200 ' | ./filter_transform_cdx.py > cdx.UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.tsv
+
+TODO: nasty escaping?
+
+In psql:
+
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.UNPAYWALL-PDF-CRAWL-2018-07.filtered.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # ERROR
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.MSAG-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # ERROR
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.citeseerx_crawl_2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 1653840
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.CORE-UPSTREAM-CRAWL-2018-11.sorted.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 2827563
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.DIRECT-OA-CRAWL-2019.pdfs.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 10651736
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.DOI-LANDING-CRAWL-2018-06.200_pdf.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 768565
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.OA-JOURNAL-TESTCRAWL-TWO-2018.pdf.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 5310017
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.SEMSCHOLAR-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 2219839
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.TARGETED-PDF-CRAWL-2017.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # ERROR
+ COPY cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) FROM '/sandcrawler-db/backfill/cdx/cdx.UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # ERROR
+
+NOTE: these largely didn't work; will need to write a batch importer.
+
+Batch import process:
+
+ cat UNPAYWALL-PDF-CRAWL-2018-07.filtered.cdx MSAG-PDF-CRAWL-2017.cdx TARGETED-PDF-CRAWL-2017.cdx UNPAYWALL-PDF-CRAWL-2019-04.pdfs_sorted.cdx | ./backfill_cdx.py
+ # Done: Counter({'raw_lines': 123254127, 'total': 51365599, 'batches': 51365})
+
+## `fatcat_file`
+
+ zcat file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0]] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv
+
+In psql:
+
+ COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # => COPY 24727350
+
+## `file_meta`
+
+ zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv
+
+In psql:
+
+ COPY file_meta FROM '/sandcrawler-db/backfill/file_meta.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # -> COPY 5860092
+
+## `petabox`
+
+ zcat /fast/download/file_export.2019-07-07.json.gz | rg '//archive.org/' | pigz > /fast/download/file_export.2019-07-07.petabox.json.gz
+ zcat /fast/download/file_export.2019-07-07.petabox.json.gz | ./petabox_transform.py | sort -u -S 8G | awk '{print $3 "\t" $1 "\t" $2}' | uniq -s40 | awk '{print $2 "\t" $3 "\t" $1}' > petabox.fatcat_2019-07-07.tsv
+
+In psql:
+
+ COPY petabox FROM '/sandcrawler-db/backfill/petabox.fatcat_2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # -> COPY 2887834
+
+## `grobid`
+
+Quick test:
+
+ zcat /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part-00000.gz | cut -f2 | head | ./backfill_grobid.py
+
+Run big batch:
+
+ ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | cut -f2 | ./backfill_grobid.py'
+ # [...]
+ # Done: Counter({'minio-success': 161605, 'total': 161605, 'raw_lines': 161605, 'batches': 161})
+ # [...]
+
+Was running slow with lots of iowait and 99% jbd2. This seems to be disk I/O. Going to try:
+
+ sudo mount /dev/sdc1 /sandcrawler-minio/ -o data=writeback,noatime,nobarrier
+
+ # -j8: 20+ M/s write, little jbd2
+ # -j16: 30+ M/s write, little jbd2
+ # -j12: 30+ M/s write, going with this
+
+For general use should go back to:
+
+ sudo mount /dev/sdc1 /sandcrawler-minio/ -o data=noatime
+
+ # -j4: Still pretty slow, only ~3-5 M/s disk write. jbd2 consistently at 99%, 360 K/s write
+
+## rough table sizes
+
+ table_name | table_size | indexes_size | total_size
+ --------------------------------------------------------------+------------+--------------+------------
+ "public"."cdx" | 11 GB | 8940 MB | 20 GB
+ "public"."shadow" | 8303 MB | 7205 MB | 15 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."file_meta" | 814 MB | 382 MB | 1196 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ [...]
+
diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py
new file mode 100755
index 0000000..f929502
--- /dev/null
+++ b/sql/backfill/backfill_cdx.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" transform helper script for CDX backfill into sandcrawler
+postgresql.
+
+Most of this file was copied from '../python/common.py'.
+"""
+
+import json, os, sys, collections
+import base64
+import psycopg2
+import psycopg2.extras
+
# Canonical mimetypes that raw CDX mimetype strings get collapsed onto
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)

def normalize_mime(raw):
    """
    Collapse a raw mimetype string onto one of the NORMAL_MIME values.

    Matching is case-insensitive and prefix-based, so anything following the
    base type (eg, "application/pdf+journal") is ignored.

    Returns the normalized mimetype, or None if `raw` is None/empty or does
    not match a known type.
    """
    if not raw:
        # previously None raised AttributeError; '' already returned None
        return None
    raw = raw.lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases: aliases that map onto one of the normal types
    if raw.startswith('application/xml'):
        return 'text/xml'
    if raw.startswith('application/x-pdf'):
        return 'application/pdf'
    return None


def test_normalize_mime():
    assert normalize_mime(None) is None
    assert normalize_mime("") is None
    assert normalize_mime("asdf") is None
    assert normalize_mime("application/pdf") == "application/pdf"
    assert normalize_mime("application/pdf+journal") == "application/pdf"
    assert normalize_mime("Application/PDF") == "application/pdf"
    assert normalize_mime("application/p") is None
    assert normalize_mime("application/xml+stuff") == "text/xml"
    assert normalize_mime("application/x-pdf") == "application/pdf"
    assert normalize_mime("application/x-html") is None
+
def b32_hex(s):
    """
    Convert a base32-encoded SHA-1 digest to lowercase hex.

    Only the first whitespace-separated token of `s` is used, and an optional
    "sha1:" prefix is removed. A token that is not exactly 32 characters long
    is returned (lowercased) without decoding.

    Raises ValueError if `s` is empty/whitespace-only, or (from the base32
    decoder) if a 32 character token is not valid base32.
    """
    tokens = s.strip().split()
    if not tokens:
        # previously surfaced as an unrelated IndexError from `[0]`
        raise ValueError("empty digest string")
    s = tokens[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    # the base32 alphabet accepted by b32decode is upper-case
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
def parse_cdx_line(raw_cdx):
    """
    Parse one whitespace-separated CDX line into a row dict for the `cdx` table.

    Returns None when the line has fewer than 11 fields, is not an HTTP 200
    capture, has a mimetype normalize_mime() does not recognize, has
    non-numeric datetime/size/offset, contains a bare '-' placeholder in any
    used field, or carries a digest that is not valid 32-character base32.

    Otherwise returns a dict with keys: url, datetime, sha1hex, cdx_sha1hex
    (always None here), mimetype, warc_path, warc_csize, warc_offset.
    """

    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    mime = normalize_mime(cdx[3])
    http_status = cdx[4]
    key = cdx[5]
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
            and http_status == "200" and len(key) == 32 and dt.isdigit()
            and mime is not None):
        return None

    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
        return None

    # these are the new/specific bits
    try:
        sha1 = b32_hex(key)
    except ValueError:
        # isalnum() also admits characters outside the base32 alphabet
        # (eg, the digits 0, 1, 8, 9); skip the line instead of aborting
        # the whole run on one bad digest
        return None
    return dict(url=url, datetime=dt, sha1hex=sha1, cdx_sha1hex=None, mimetype=mime, warc_path=warc, warc_csize=int(c_size), warc_offset=int(offset))
+
def insert(cur, batch):
    """
    Insert a batch of parsed CDX rows into the `cdx` table in one statement.

    `cur` is an open psycopg2 cursor; `batch` is a list of dicts as returned
    by parse_cdx_line(). Rows that conflict with the `cdx_pkey` constraint are
    skipped. Nothing is committed here and nothing is returned.
    """
    sql = """
        INSERT INTO
        cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
        VALUES %s
        ON CONFLICT ON CONSTRAINT cdx_pkey DO NOTHING
        RETURNING 1;
    """
    rows = [(d['url'], d['datetime'], d['sha1hex'], d['mimetype'],
             d['warc_path'], d['warc_csize'], d['warc_offset'])
            for d in batch]
    # The RETURNING rows are not fetched, so inserted-vs-existing counts are
    # not available to the caller.
    psycopg2.extras.execute_values(cur, sql, rows)
+
def stdin_to_pg():
    """
    Bulk-load CDX lines from stdin into the `cdx` table of the local
    "sandcrawler" postgres database.

    Lines rejected by parse_cdx_line() are skipped. Accepted rows are inserted
    and committed in batches of 1000. Progress counters are printed every
    10000 raw input lines and once more at the end. The cursor and connection
    are closed on exit, including on error.
    """
    # no host means it will use local domain socket by default
    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    cur = conn.cursor()
    counts = collections.Counter({'total': 0})
    batch = []
    try:
        for l in sys.stdin:
            l = l.strip()
            if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
                print("Progress: {}...".format(counts))
            counts['raw_lines'] += 1
            if not l:
                continue
            info = parse_cdx_line(l)
            if not info:
                continue
            # XXX: filter to, eg, PDF or octet/stream (derp)
            batch.append(info)
            counts['total'] += 1
            if len(batch) >= 1000:
                insert(cur, batch)
                conn.commit()
                batch = []
                counts['batches'] += 1
        if batch:
            # flush the trailing partial batch and count it like the others
            insert(cur, batch)
            conn.commit()
            counts['batches'] += 1
    finally:
        cur.close()
        conn.close()
    print("Done: {}".format(counts))
+
+if __name__=='__main__':
+ stdin_to_pg()
diff --git a/sql/backfill/backfill_file_meta.py b/sql/backfill/backfill_file_meta.py
new file mode 100755
index 0000000..e3b40a0
--- /dev/null
+++ b/sql/backfill/backfill_file_meta.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" transform helper script for file_meta backfill into
+sandcrawler postgresql.
+
+Most of this file was copied from '../python/common.py'.
+"""
+
+import json, os, sys, collections
+import psycopg2
+import psycopg2.extras
+
+
def insert(cur, batch):
    """
    Insert a batch of rows into the `file_meta` table in one statement.

    `cur` is an open psycopg2 cursor; `batch` is a list of 5-element
    sequences. Conflicting rows are skipped. Nothing is committed here.
    """
    # NOTE(review): no column list is given, so each row must match the
    # table's column order exactly -- confirm against the schema.
    sql = """
        INSERT INTO
        file_meta
        VALUES %s
        ON CONFLICT DO NOTHING;
    """
    psycopg2.extras.execute_values(cur, sql, batch)
+
def stdin_to_pg():
    """
    Bulk-load tab-separated rows from stdin into the `file_meta` table of the
    local "sandcrawler" postgres database.

    Every non-blank line must have exactly 5 tab-separated fields; otherwise
    ValueError is raised. Rows are inserted and committed in batches of 1000.
    Progress counters are printed every 10000 raw input lines and once more
    at the end. The cursor and connection are closed on exit.
    """
    # no host means it will use local domain socket by default
    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    cur = conn.cursor()
    counts = collections.Counter({'total': 0})
    batch = []
    try:
        for l in sys.stdin:
            if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
                print("Progress: {}...".format(counts))
            counts['raw_lines'] += 1
            if not l.strip():
                continue
            info = l.split("\t")
            if len(info) != 5:
                # explicit check: an assert would vanish under `python -O`
                raise ValueError(
                    "expected 5 tab-separated fields on input line {}, got {}".format(
                        counts['raw_lines'], len(info)))
            # strips the trailing newline; a blank final column becomes NULL
            info[-1] = info[-1].strip() or None
            batch.append(info)
            counts['total'] += 1
            if len(batch) >= 1000:
                insert(cur, batch)
                conn.commit()
                batch = []
                counts['batches'] += 1
        if batch:
            # flush the trailing partial batch and count it like the others
            insert(cur, batch)
            conn.commit()
            counts['batches'] += 1
    finally:
        cur.close()
        conn.close()
    print("Done: {}".format(counts))
+
+if __name__=='__main__':
+ stdin_to_pg()
diff --git a/sql/backfill/backfill_grobid.py b/sql/backfill/backfill_grobid.py
new file mode 100755
index 0000000..08fad7f
--- /dev/null
+++ b/sql/backfill/backfill_grobid.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" transform helper script for GROBID backfill into
+sandcrawler minio and postgresql.
+"""
+
+import json, os, sys, collections, io
+import base64
+import requests
+from minio import Minio
+import psycopg2
+import psycopg2.extras
+
+
def b32_hex(s):
    """
    Convert a base32-encoded SHA-1 digest to lowercase hex.

    Only the first whitespace-separated token of `s` is used, and an optional
    "sha1:" prefix is removed. A token that is not exactly 32 characters long
    is returned (lowercased) without decoding.

    Raises ValueError if `s` is empty/whitespace-only, or (from the base32
    decoder) if a 32 character token is not valid base32.
    """
    tokens = s.strip().split()
    if not tokens:
        # previously surfaced as an unrelated IndexError from `[0]`
        raise ValueError("empty digest string")
    s = tokens[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    # the base32 alphabet accepted by b32decode is upper-case
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
def insert(cur, batch):
    """
    Insert a batch of GROBID status rows into the `grobid` table.

    `cur` is an open psycopg2 cursor; `batch` is a list of dicts with keys
    sha1hex, grobid_version, status_code, status, fatcat_release and metadata.
    Conflicting rows are skipped. Nothing is committed here.
    """
    sql = """
        INSERT INTO
        grobid (sha1hex, grobid_version, status_code, status, fatcat_release, metadata)
        VALUES %s
        ON CONFLICT DO NOTHING;
    """
    rows = [(d['sha1hex'], d['grobid_version'], d['status_code'],
             d['status'], d['fatcat_release'], d['metadata'])
            for d in batch]
    psycopg2.extras.execute_values(cur, sql, rows)
+
def stdin_to_pg():
    """
    Read JSON lines from stdin, upload each TEI-XML document to the "grobid"
    minio bucket, and record a status row per document in the `grobid` table.

    Each JSON object must carry `pdf_hash` and `tei_xml`. Objects are stored
    under "<sha1hex[0:2]>/<sha1hex[2:4]>/<sha1hex>.tei.xml". Database rows are
    inserted and committed in batches of 1000. Minio credentials come from
    the MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables.
    """
    mc = Minio('localhost:9000',
        access_key=os.environ['MINIO_ACCESS_KEY'],
        secret_key=os.environ['MINIO_SECRET_KEY'],
        secure=False)
    # no host means it will use local domain socket by default
    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    cur = conn.cursor()
    counts = collections.Counter({'total': 0})
    batch = []
    try:
        for l in sys.stdin:
            if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
                print("Progress: {}...".format(counts))
            counts['raw_lines'] += 1
            l = l.strip()
            if not l:
                continue
            row = json.loads(l)
            if not row:
                continue
            sha1hex = b32_hex(row['pdf_hash'])
            grobid_xml = row['tei_xml'].encode('utf-8')
            # put_object is given the byte length, not the character count
            grobid_xml_len = len(grobid_xml)
            grobid_xml = io.BytesIO(grobid_xml)

            key = "{}/{}/{}.tei.xml".format(
                sha1hex[0:2],
                sha1hex[2:4],
                sha1hex)
            mc.put_object("grobid", key, grobid_xml, grobid_xml_len,
                content_type="application/tei+xml",
                metadata=None)
            counts['minio-success'] += 1

            info = dict(
                sha1hex=sha1hex,
                grobid_version=None, # TODO
                status_code=200,
                status=None,
                fatcat_release=None,
                metadata=None,
            )
            batch.append(info)
            counts['total'] += 1
            if len(batch) >= 1000:
                insert(cur, batch)
                conn.commit()
                batch = []
                counts['batches'] += 1
        if batch:
            # flush the trailing partial batch and count it like the others
            insert(cur, batch)
            conn.commit()
            counts['batches'] += 1
    finally:
        cur.close()
        conn.close()
    print("Done: {}".format(counts))
+
+if __name__=='__main__':
+ stdin_to_pg()
diff --git a/sql/backfill/backfill_grobid_unpaywall.py b/sql/backfill/backfill_grobid_unpaywall.py
new file mode 100755
index 0000000..58e9e3c
--- /dev/null
+++ b/sql/backfill/backfill_grobid_unpaywall.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" transform helper script for GROBID backfill into
+sandcrawler minio and postgresql.
+
+This variant of backfill_grobid.py pushes into the unpaywall bucket of
+sandcrawler-minio and doesn't push anything to sandcrawler table in general.
+"""
+
+import json, os, sys, collections, io
+import base64
+import requests
+from minio import Minio
+import psycopg2
+import psycopg2.extras
+
+
def b32_hex(s):
    """
    Convert a base32-encoded SHA-1 digest to lowercase hex.

    Only the first whitespace-separated token of `s` is used, and an optional
    "sha1:" prefix is removed. A token that is not exactly 32 characters long
    is returned (lowercased) without decoding.

    Raises ValueError if `s` is empty/whitespace-only, or (from the base32
    decoder) if a 32 character token is not valid base32.
    """
    tokens = s.strip().split()
    if not tokens:
        # previously surfaced as an unrelated IndexError from `[0]`
        raise ValueError("empty digest string")
    s = tokens[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    # the base32 alphabet accepted by b32decode is upper-case
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
def stdin_to_minio():
    """
    Read JSON lines from stdin and upload each TEI-XML document to the
    "unpaywall" minio bucket; nothing is written to postgres.

    Each JSON object must carry `pdf_hash` and `tei_xml`. Objects are stored
    under "grobid/<sha1hex[0:2]>/<sha1hex[2:4]>/<sha1hex>.tei.xml". Minio
    credentials come from the MINIO_ACCESS_KEY and MINIO_SECRET_KEY
    environment variables. Progress counters are printed every 10000 raw
    input lines and once more at the end.
    """
    mc = Minio('localhost:9000',
        access_key=os.environ['MINIO_ACCESS_KEY'],
        secret_key=os.environ['MINIO_SECRET_KEY'],
        secure=False)
    counts = collections.Counter({'total': 0})
    for l in sys.stdin:
        if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
            print("Progress: {}...".format(counts))
        counts['raw_lines'] += 1
        l = l.strip()
        if not l:
            continue
        row = json.loads(l)
        if not row:
            continue
        # 'total' was initialised but never incremented, so it always
        # reported 0; count each parsed record before the upload
        counts['total'] += 1
        sha1hex = b32_hex(row['pdf_hash'])
        grobid_xml = row['tei_xml'].encode('utf-8')
        # put_object is given the byte length, not the character count
        grobid_xml_len = len(grobid_xml)
        grobid_xml = io.BytesIO(grobid_xml)

        key = "grobid/{}/{}/{}.tei.xml".format(
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex)
        mc.put_object("unpaywall", key, grobid_xml, grobid_xml_len,
            content_type="application/tei+xml",
            metadata=None)
        counts['minio-success'] += 1

    print("Done: {}".format(counts))
+
+if __name__=='__main__':
+ stdin_to_minio()
diff --git a/sql/backfill/filter_transform_cdx.py b/sql/backfill/filter_transform_cdx.py
new file mode 100755
index 0000000..3507dfc
--- /dev/null
+++ b/sql/backfill/filter_transform_cdx.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" transform helper script for CDX backfill into sandcrawler
+postgresql.
+
+Most of this file was copied from '../python/common.py'.
+"""
+
+import json, os, sys
+import base64
+
# Canonical mimetypes that raw CDX mimetype strings get collapsed onto
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)

def normalize_mime(raw):
    """
    Collapse a raw mimetype string onto one of the NORMAL_MIME values.

    Matching is case-insensitive and prefix-based, so anything following the
    base type (eg, "application/pdf+journal") is ignored.

    Returns the normalized mimetype, or None if `raw` is None/empty or does
    not match a known type.
    """
    if not raw:
        # previously None raised AttributeError; '' already returned None
        return None
    raw = raw.lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases: aliases that map onto one of the normal types
    if raw.startswith('application/xml'):
        return 'text/xml'
    if raw.startswith('application/x-pdf'):
        return 'application/pdf'
    return None


def test_normalize_mime():
    assert normalize_mime(None) is None
    assert normalize_mime("") is None
    assert normalize_mime("asdf") is None
    assert normalize_mime("application/pdf") == "application/pdf"
    assert normalize_mime("application/pdf+journal") == "application/pdf"
    assert normalize_mime("Application/PDF") == "application/pdf"
    assert normalize_mime("application/p") is None
    assert normalize_mime("application/xml+stuff") == "text/xml"
    assert normalize_mime("application/x-pdf") == "application/pdf"
    assert normalize_mime("application/x-html") is None
+
def b32_hex(s):
    """
    Convert a base32-encoded SHA-1 digest to lowercase hex.

    Only the first whitespace-separated token of `s` is used, and an optional
    "sha1:" prefix is removed. A token that is not exactly 32 characters long
    is returned (lowercased) without decoding.

    Raises ValueError if `s` is empty/whitespace-only, or (from the base32
    decoder) if a 32 character token is not valid base32.
    """
    tokens = s.strip().split()
    if not tokens:
        # previously surfaced as an unrelated IndexError from `[0]`
        raise ValueError("empty digest string")
    s = tokens[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    # the base32 alphabet accepted by b32decode is upper-case
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
def parse_cdx_line(raw_cdx):
    """
    Parse one whitespace-separated CDX line into a row dict.

    Returns None when the line has fewer than 11 fields, is not an HTTP 200
    capture, has a mimetype normalize_mime() does not recognize, has
    non-numeric datetime/size/offset, contains a bare '-' placeholder in any
    used field, or carries a digest that is not valid 32-character base32.

    Otherwise returns a dict with keys: url, datetime, sha1hex, cdx_sha1hex
    (always None here), mimetype, warc_path, warc_csize, warc_offset.
    """

    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    mime = normalize_mime(cdx[3])
    http_status = cdx[4]
    key = cdx[5]
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
            and http_status == "200" and len(key) == 32 and dt.isdigit()
            and mime is not None):
        return None

    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
        return None

    # these are the new/specific bits
    try:
        sha1 = b32_hex(key)
    except ValueError:
        # isalnum() also admits characters outside the base32 alphabet
        # (eg, the digits 0, 1, 8, 9); skip the line instead of aborting
        # the whole run on one bad digest
        return None
    return dict(url=url, datetime=dt, sha1hex=sha1, cdx_sha1hex=None, mimetype=mime, warc_path=warc, warc_csize=int(c_size), warc_offset=int(offset))
+
def main():
    """
    Filter CDX lines from stdin and print the accepted ones to stdout as
    tab-separated rows: url, datetime, sha1hex, mimetype, warc_path,
    warc_csize, warc_offset.
    """
    for l in sys.stdin:
        l = l.strip()
        if not l:
            continue
        info = parse_cdx_line(l)
        if not info:
            continue
        print("\t".join([info['url'], info['datetime'], info['sha1hex'], info['mimetype'], info['warc_path'], str(info['warc_csize']), str(info['warc_offset'])]))

# Guarded so that importing this module (eg, to run test_normalize_mime)
# does not start consuming stdin.
if __name__=='__main__':
    main()
+
diff --git a/sql/backfill/petabox_transform.py b/sql/backfill/petabox_transform.py
new file mode 100755
index 0000000..b638911
--- /dev/null
+++ b/sql/backfill/petabox_transform.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+import json, sys, os
+
# Reads JSON lines from stdin; for every archive.org "download" or "serve" URL
# listed under a record's `urls`, prints a tab-separated row: item, path, sha1.
# Records without a sha1 are skipped.
for l in sys.stdin:  # stream line by line instead of readlines() on the whole dump
    l = l.strip()
    if not l:
        continue
    r = json.loads(l)
    sha1hex = r.get('sha1')
    if not sha1hex:
        continue
    for url in r.get('urls') or []:
        u = url['url']
        if '//archive.org/' not in u:
            continue
        u = u.split('/')
        if u[2] == 'web.archive.org':
            # host is web.archive.org; the '//archive.org/' match came from
            # further along in the URL
            continue
        #print(u)
        assert u[2] == 'archive.org' and u[3] in ('download', 'serve')
        item = u[4]
        path = '/'.join(u[5:])
        print("\t".join([item, path, sha1hex]))
diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql
new file mode 100644
index 0000000..a7d6c2b
--- /dev/null
+++ b/sql/dump_file_meta.sql
@@ -0,0 +1,12 @@
+
-- Dump every file_meta row as a (sha1hex, JSON) pair per line, ordered by
-- sha1hex, to a server-side file. The transaction is read-only and is rolled
-- back at the end.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT
        fm.sha1hex,
        row_to_json(fm)
    FROM file_meta AS fm
    ORDER BY fm.sha1hex ASC
)
TO '/srv/sandcrawler/tasks/file_meta_dump.tsv'
WITH NULL '';

ROLLBACK;
diff --git a/sql/dump_regrobid_pdf.sql b/sql/dump_regrobid_pdf.sql
new file mode 100644
index 0000000..b846834
--- /dev/null
+++ b/sql/dump_regrobid_pdf.sql
@@ -0,0 +1,15 @@
+
+-- Run like:
+-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json
+
-- Emit (sha1hex, JSON) pairs on stdout for every application/pdf cdx row whose
-- sha1hex has a grobid row with no grobid_version recorded.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT
        cdx.sha1hex,
        row_to_json(cdx)
    FROM cdx
    WHERE cdx.mimetype = 'application/pdf'
        AND EXISTS (
            SELECT 1
            FROM grobid
            WHERE grobid.sha1hex = cdx.sha1hex
                AND grobid.grobid_version IS NULL
        )
)
TO STDOUT
WITH NULL '';

ROLLBACK;
diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql
new file mode 100644
index 0000000..e7c48f3
--- /dev/null
+++ b/sql/dump_regrobid_pdf_petabox.sql
@@ -0,0 +1,15 @@
+
+-- Run like:
+-- psql sandcrawler < dump_regrobid_pdf_petabox.sql
+-- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json
+
-- Write (sha1hex, JSON) pairs to a server-side file for every petabox row whose
-- sha1hex has a grobid row with no grobid_version recorded.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT
        petabox.sha1hex,
        row_to_json(petabox)
    FROM petabox
    WHERE EXISTS (
        SELECT 1
        FROM grobid
        WHERE grobid.sha1hex = petabox.sha1hex
            AND grobid.grobid_version IS NULL
    )
)
TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
WITH NULL '';

ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
new file mode 100644
index 0000000..917d88b
--- /dev/null
+++ b/sql/dump_reingest_quarterly.sql
@@ -0,0 +1,31 @@
+
-- Dump 'pdf' ingest requests from the fatcat-changelog and fatcat-ingest
-- sources, created between 8 hours and 91 days ago, whose recorded result is a
-- miss with one of the statuses selected below. One JSON object per line.
--
-- NOTE(review): the output path says "weekly" although this is the quarterly
-- dump; presumably the consumer expects this name -- confirm before renaming.
-- NOTE(review): the join matches on base_url only; if ingest_file_result rows
-- are also distinguished by ingest_type, match that too -- confirm against
-- the schema.
COPY (
    SELECT row_to_json(ingest_request.*)
    FROM ingest_request
    -- the WHERE filters on ingest_file_result discard unmatched requests, so
    -- this was never an outer join in effect
    INNER JOIN ingest_file_result
        ON ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.ingest_type = 'pdf'
        AND ingest_file_result.hit = FALSE
        AND ingest_request.created < NOW() - '8 hour'::INTERVAL
        AND ingest_request.created > NOW() - '91 day'::INTERVAL
        AND ingest_request.ingest_request_source IN (
            'fatcat-changelog',
            'fatcat-ingest'
        )
        AND (
            ingest_file_result.status LIKE 'spn2-%'
            OR ingest_file_result.status IN (
                'cdx-error',
                'wayback-error',
                'wayback-content-error',
                'petabox-error',
                'gateway-timeout'
            )
        )
        -- spn2 statuses excluded from the 'spn2-%' match above
        AND ingest_file_result.status NOT IN (
            'spn2-error:invalid-url-syntax',
            'spn2-error:filesize-limit',
            'spn2-error:not-found',
            'spn2-error:blocked-url',
            'spn2-error:too-many-redirects',
            'spn2-error:network-authentication-required',
            'spn2-error:unknown'
        )
) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql
new file mode 100644
index 0000000..a8ed72f
--- /dev/null
+++ b/sql/dump_reingest_spn.sql
@@ -0,0 +1,39 @@
+
+-- Dump 'savepapernow-web' PDF ingest requests created between 2 hours and
+-- 31 days ago whose recorded result is a miss (hit = false) with one of the
+-- statuses selected below; one row_to_json() row per output line.
+--
+-- The result is matched on ingest_type as well as base_url, so a result stored
+-- for a different ingest type of the same URL cannot select a PDF request.
+
+COPY (
+    SELECT row_to_json(ingest_request.*)
+    FROM ingest_request
+    -- every filter below needs a result row, so this is an inner join
+    INNER JOIN ingest_file_result
+        ON ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    WHERE ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.hit = false
+        AND ingest_request.created < NOW() - '2 hour'::INTERVAL
+        AND ingest_request.created > NOW() - '31 day'::INTERVAL
+        AND ingest_request.ingest_request_source = 'savepapernow-web'
+        AND (
+            ingest_file_result.status LIKE 'spn2-%'
+            -- OR ingest_file_result.status LIKE 'cdx-error'
+            -- OR ingest_file_result.status LIKE 'wayback-error'
+            -- OR ingest_file_result.status LIKE 'wayback-content-error'
+            OR ingest_file_result.status LIKE 'petabox-error'
+            -- OR ingest_file_result.status LIKE 'gateway-timeout'
+        )
+        -- spn2 statuses that are deliberately left out of the re-try set
+        AND ingest_file_result.status NOT IN (
+            'spn2-error:invalid-url-syntax',
+            'spn2-error:filesize-limit',
+            'spn2-error:not-found',
+            'spn2-error:blocked-url',
+            'spn2-error:too-many-redirects',
+            'spn2-error:network-authentication-required',
+            'spn2-error:unknown'
+        )
+) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json';
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
new file mode 100644
index 0000000..65800eb
--- /dev/null
+++ b/sql/dump_reingest_weekly.sql
@@ -0,0 +1,45 @@
+
+-- Dump PDF ingest requests that came from fatcat ('fatcat-changelog' or
+-- 'fatcat-ingest'), were created between 8 hours and 8 days ago, and whose
+-- recorded result is a miss (hit = false) with one of the statuses selected
+-- below; one row_to_json() row per output line.
+--
+-- The result is matched on ingest_type as well as base_url, so a result stored
+-- for a different ingest type of the same URL cannot select a PDF request.
+
+COPY (
+    SELECT row_to_json(ingest_request.*)
+    FROM ingest_request
+    -- every filter below needs a result row, so this is an inner join
+    INNER JOIN ingest_file_result
+        ON ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    WHERE ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.hit = false
+        AND ingest_request.created < NOW() - '8 hour'::INTERVAL
+        AND ingest_request.created > NOW() - '8 day'::INTERVAL
+        AND ingest_request.ingest_request_source IN ('fatcat-changelog', 'fatcat-ingest')
+        AND (
+            ingest_file_result.status LIKE 'spn2-%'
+            -- OR ingest_file_result.status LIKE 'cdx-error'
+            -- OR ingest_file_result.status LIKE 'wayback-error'
+            -- OR ingest_file_result.status LIKE 'wayback-content-error'
+            OR ingest_file_result.status LIKE 'petabox-error'
+            -- OR ingest_file_result.status LIKE 'gateway-timeout'
+        )
+        -- spn2 statuses that are deliberately left out of the re-try set
+        AND ingest_file_result.status NOT IN (
+            'spn2-error:invalid-url-syntax',
+            'spn2-error:filesize-limit',
+            'spn2-error:not-found',
+            'spn2-error:blocked-url',
+            'spn2-error:too-many-redirects',
+            'spn2-error:network-authentication-required',
+            'spn2-error:unknown'
+        )
+) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
new file mode 100644
index 0000000..a7fb920
--- /dev/null
+++ b/sql/dump_unextracted_pdf.sql
@@ -0,0 +1,30 @@
+
+-- Dumps one cdx row (as JSON) per sha1hex for files that have a grobid row and
+-- an ingest_file_result whose terminal_sha1hex points at them, but no pdf_meta row.
+--
+-- Run like:
+--    psql sandcrawler < dump_unextracted_pdf.sql
+
+-- read-only transaction; nothing is written to the database, hence the ROLLBACK
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+    FROM grobid
+    INNER JOIN cdx
+        ON cdx.sha1hex = grobid.sha1hex
+    -- to restrict the dump to files that have a fatcat_file row, also add:
+    -- INNER JOIN fatcat_file
+    --     ON fatcat_file.sha1hex = grobid.sha1hex
+    INNER JOIN ingest_file_result
+        ON ingest_file_result.terminal_sha1hex = grobid.sha1hex
+    WHERE NOT EXISTS (
+        SELECT 1
+        FROM pdf_meta
+        WHERE pdf_meta.sha1hex = grobid.sha1hex
+    )
+)
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql
new file mode 100644
index 0000000..bb9f162
--- /dev/null
+++ b/sql/dump_unextracted_pdf_petabox.sql
@@ -0,0 +1,25 @@
+
+-- Dumps one petabox row (as JSON) per sha1hex for files that have a grobid row
+-- but no pdf_meta row.
+--
+-- Run like:
+--    psql sandcrawler < dump_unextracted_pdf_petabox.sql
+
+-- read-only transaction; nothing is written to the database, hence the ROLLBACK
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+    FROM grobid
+    INNER JOIN petabox
+        ON petabox.sha1hex = grobid.sha1hex
+    WHERE NOT EXISTS (
+        SELECT 1
+        FROM pdf_meta
+        WHERE pdf_meta.sha1hex = grobid.sha1hex
+    )
+)
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql
new file mode 100644
index 0000000..81caf18
--- /dev/null
+++ b/sql/dump_ungrobid_pdf.sql
@@ -0,0 +1,27 @@
+
+-- Dumps one cdx row (as JSON) per sha1hex for 'application/pdf' captures that
+-- have no grobid row with a non-NULL status.
+--
+-- Run like:
+--    psql sandcrawler < dump_ungrobid_pdf.sql
+
+-- read-only transaction; nothing is written to the database, hence the ROLLBACK
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+    FROM cdx
+    WHERE cdx.mimetype = 'application/pdf'
+        AND NOT EXISTS (
+            SELECT 1
+            FROM grobid
+            WHERE grobid.sha1hex = cdx.sha1hex
+                AND grobid.status IS NOT NULL
+        )
+        -- uncomment/comment this to control whether only fatcat files are included
+        --AND EXISTS (SELECT 1 FROM fatcat_file WHERE fatcat_file.sha1hex = cdx.sha1hex)
+)
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql
new file mode 100644
index 0000000..b7a1db2
--- /dev/null
+++ b/sql/dump_ungrobid_pdf_petabox.sql
@@ -0,0 +1,26 @@
+
+-- Dumps one petabox row (as JSON) per sha1hex for files that have a fatcat_file
+-- row but no grobid row with a non-NULL status.
+--
+-- Run like:
+--    psql sandcrawler < dump_ungrobid_pdf_petabox.sql
+
+-- read-only transaction; nothing is written to the database, hence the ROLLBACK
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+    FROM petabox
+    WHERE NOT EXISTS (
+            SELECT 1
+            FROM grobid
+            WHERE grobid.sha1hex = petabox.sha1hex
+                AND grobid.status IS NOT NULL
+        )
+        -- uncomment/comment this to control whether only fatcat files are included
+        AND EXISTS (SELECT 1 FROM fatcat_file WHERE fatcat_file.sha1hex = petabox.sha1hex)
+)
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
new file mode 100644
index 0000000..333ff7b
--- /dev/null
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -0,0 +1,30 @@
+
+-- Dumps up to 1000 grobid rows (as JSON) that carry a fatcat_release but have
+-- no fatcat_file row for the same sha1hex.
+--
+-- Run like:
+--    psql sandcrawler < dump_unmatched_glutton_pdf.sql
+--
+-- Output goes to the file named in the TO clause. To capture it from psql's
+-- stdout instead (`psql sandcrawler < THING.sql > THING.2019-09-23.json`),
+-- switch to the commented-out TO STDOUT variant at the bottom.
+
+-- read-only transaction; nothing is written to the database, hence the ROLLBACK
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+    SELECT row_to_json(grobid)
+    FROM grobid
+    WHERE grobid.fatcat_release IS NOT NULL
+        AND NOT EXISTS (
+            SELECT 1
+            FROM fatcat_file
+            WHERE fatcat_file.sha1hex = grobid.sha1hex
+        )
+    LIMIT 1000
+)
+TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json';
+--TO STDOUT
+--WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/example.env b/sql/example.env
new file mode 100644
index 0000000..3a13689
--- /dev/null
+++ b/sql/example.env
@@ -0,0 +1 @@
+DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler"
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
new file mode 100644
index 0000000..b749557
--- /dev/null
+++ b/sql/ingest_again.md
@@ -0,0 +1,158 @@
+
+## re-ingest some broken
+
+ COPY ( -- PDF requests whose result is an spn2-* miss updated between 1 hour and 12 days ago
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-%'
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit' -- single prefix, so the exclusion actually matches
+ ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+ AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'wayback-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'gateway-timeout'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'petabox-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+
+Push to kafka (shuffled):
+
+ cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+ cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0
+
+ cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Push to kafka (not shuffled):
+
+ cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## just recent fatcat-ingest
+
+ COPY ( -- 'fatcat-ingest' PDF requests whose result is a listed miss updated between 1 hour and 7 days ago
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND (ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'gateway-timeout'
+ OR ingest_file_result.status like 'wayback-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit' -- single prefix, so the exclusion actually matches
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
+
+ # note: shuf
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+
+ cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## specific domains
+
+protocols.io:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.17504/protocols.io%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+biorxiv/medrxiv:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.1101/20%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt
new file mode 100644
index 0000000..b684400
--- /dev/null
+++ b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt
@@ -0,0 +1,326 @@
+ doi_prefix | status | count
+------------+-------------------------------+--------
+ 10.1001 | | 230
+ 10.1002 | | 3914
+ 10.1002 | terminal-bad-status | 1540
+ 10.1002 | forbidden | 1072
+ 10.1002 | redirect-loop | 995
+ 10.1002 | no-pdf-link | 210
+ 10.1016 | | 7976
+ 10.1016 | no-pdf-link | 4648
+ 10.1016 | terminal-bad-status | 1778
+ 10.1016 | forbidden | 622
+ 10.1016 | spn2-error:too-many-redirects | 344
+ 10.1016 | redirect-loop | 225
+ 10.1017 | | 2040
+ 10.1017 | no-pdf-link | 720
+ 10.1017 | success | 441
+ 10.1017 | link-loop | 371
+ 10.1017 | bad-redirect | 227
+ 10.1021 | | 1722
+ 10.1021 | blocked-cookie | 1552
+ 10.1029 | | 248
+ 10.1039 | | 1160
+ 10.1039 | redirect-loop | 486
+ 10.1039 | spn2-error:too-many-redirects | 395
+ 10.1039 | spn2-wayback-error | 213
+ 10.1051 | | 695
+ 10.1051 | success | 557
+ 10.1055 | | 541
+ 10.1055 | not-found | 295
+ 10.1055 | redirect-loop | 213
+ 10.1057 | | 2835
+ 10.1057 | redirect-loop | 2617
+ 10.1061 | | 550
+ 10.1061 | spn2-error:too-many-redirects | 425
+ 10.1063 | | 600
+ 10.1063 | spn2-error:too-many-redirects | 328
+ 10.1080 | | 3801
+ 10.1080 | blocked-cookie | 2431
+ 10.1080 | terminal-bad-status | 711
+ 10.1080 | forbidden | 341
+ 10.1081 | | 299
+ 10.1081 | link-loop | 222
+ 10.1089 | | 236
+ 10.1089 | blocked-cookie | 228
+ 10.1093 | | 12805
+ 10.1093 | link-loop | 8627
+ 10.1093 | redirect-loop | 1659
+ 10.1093 | no-pdf-link | 1475
+ 10.1093 | bad-redirect | 428
+ 10.1093 | success | 391
+ 10.1097 | | 1497
+ 10.1097 | no-pdf-link | 503
+ 10.1097 | link-loop | 346
+ 10.1097 | spn2-error:too-many-redirects | 259
+ 10.1097 | terminal-bad-status | 202
+ 10.1101 | | 1859
+ 10.1101 | redirect-loop | 993
+ 10.1101 | forbidden | 703
+ 10.1103 | | 597
+ 10.1103 | not-found | 534
+ 10.1108 | | 1055
+ 10.1108 | no-pdf-link | 945
+ 10.1109 | | 7067
+ 10.1109 | spn2-error:too-many-redirects | 6299
+ 10.1109 | success | 667
+ 10.1111 | | 2099
+ 10.1111 | redirect-loop | 1331
+ 10.1111 | terminal-bad-status | 313
+ 10.1111 | forbidden | 226
+ 10.1115 | | 1278
+ 10.1115 | bad-redirect | 707
+ 10.1117 | | 561
+ 10.1117 | spn2-error:too-many-redirects | 501
+ 10.1126 | | 214
+ 10.1136 | | 1989
+ 10.1136 | success | 1463
+ 10.1136 | link-loop | 294
+ 10.1142 | | 300
+ 10.1142 | blocked-cookie | 237
+ 10.1145 | | 440
+ 10.1145 | blocked-cookie | 354
+ 10.1155 | | 480
+ 10.1155 | success | 474
+ 10.11588 | | 506
+ 10.11588 | no-pdf-link | 264
+ 10.11588 | success | 236
+ 10.1159 | | 226
+ 10.11606 | | 304
+ 10.1161 | | 1142
+ 10.1161 | blocked-cookie | 1011
+ 10.1163 | | 2261
+ 10.1163 | link-loop | 1767
+ 10.1163 | success | 348
+ 10.11648 | | 405
+ 10.11648 | success | 404
+ 10.1182 | | 2125
+ 10.1182 | no-pdf-link | 2024
+ 10.1183 | | 987
+ 10.1183 | redirect-loop | 838
+ 10.1186 | | 1481
+ 10.1186 | success | 1412
+ 10.1201 | | 7649
+ 10.1201 | link-loop | 5383
+ 10.1201 | forbidden | 1504
+ 10.1201 | no-pdf-link | 312
+ 10.1299 | | 264
+ 10.1299 | no-pdf-link | 209
+ 10.13134 | | 201
+ 10.1353 | | 549
+ 10.1353 | terminal-bad-status | 443
+ 10.1371 | | 552
+ 10.1371 | success | 542
+ 10.14201 | | 656
+ 10.14201 | success | 366
+ 10.14361 | | 647
+ 10.14361 | link-loop | 585
+ 10.14746 | | 260
+ 10.14746 | success | 232
+ 10.1504 | | 527
+ 10.1504 | no-pdf-link | 501
+ 10.15122 | | 246
+ 10.15122 | success | 243
+ 10.1515 | | 16240
+ 10.1515 | link-loop | 12589
+ 10.1515 | success | 1941
+ 10.1515 | no-pdf-link | 1008
+ 10.1515 | not-found | 283
+ 10.15405 | | 229
+ 10.15405 | success | 218
+ 10.1553 | | 418
+ 10.1553 | no-pdf-link | 396
+ 10.1590 | | 655
+ 10.1590 | success | 623
+ 10.17104 | | 1202
+ 10.17104 | no-pdf-link | 953
+ 10.17104 | bad-redirect | 249
+ 10.17605 | | 368
+ 10.17605 | not-found | 337
+ 10.17615 | | 9401
+ 10.17615 | redirect-loop | 5720
+ 10.17615 | spn2-wayback-error | 3099
+ 10.17615 | spn2-cdx-lookup-failure | 201
+ 10.17863 | | 438
+ 10.18148 | | 465
+ 10.18148 | success | 462
+ 10.18720 | | 210
+ 10.18821 | | 476
+ 10.18821 | redirect-loop | 366
+ 10.20345 | | 222
+ 10.20345 | terminal-bad-status | 215
+ 10.20546 | | 244
+ 10.20546 | no-pdf-link | 241
+ 10.21037 | | 232
+ 10.2118 | | 903
+ 10.2118 | redirect-loop | 853
+ 10.21203 | | 1824
+ 10.21203 | success | 1545
+ 10.2139 | | 1493
+ 10.2139 | link-loop | 1145
+ 10.2147 | | 318
+ 10.2147 | success | 267
+ 10.2172 | | 282
+ 10.2174 | | 363
+ 10.2174 | no-pdf-link | 320
+ 10.2196 | | 265
+ 10.2208 | | 299
+ 10.22215 | | 218
+ 10.22215 | success | 217
+ 10.22323 | | 289
+ 10.22323 | success | 262
+ 10.22533 | | 395
+ 10.22533 | success | 393
+ 10.22541 | | 291
+ 10.22541 | success | 275
+ 10.23919 | | 426
+ 10.23919 | spn2-error:too-many-redirects | 403
+ 10.24034 | | 319
+ 10.24034 | spn2-error | 203
+ 10.24355 | | 15360
+ 10.24355 | no-pdf-link | 15228
+ 10.24411 | | 1506
+ 10.24411 | forbidden | 823
+ 10.24411 | redirect-loop | 647
+ 10.25335 | | 550
+ 10.25335 | no-pdf-link | 550
+ 10.25365 | | 429
+ 10.25365 | success | 424
+ 10.25384 | | 338
+ 10.25384 | success | 249
+ 10.25646 | | 239
+ 10.26197 | no-pdf-link | 303
+ 10.26197 | | 303
+ 10.26226 | | 272
+ 10.26278 | | 1291
+ 10.26278 | redirect-loop | 756
+ 10.26278 | spn2-error:too-many-redirects | 509
+ 10.29327 | | 232
+ 10.2991 | | 307
+ 10.2991 | spn2-wayback-error | 227
+ 10.30965 | | 722
+ 10.30965 | link-loop | 709
+ 10.3109 | | 801
+ 10.3109 | link-loop | 572
+ 10.3109 | forbidden | 228
+ 10.31219 | | 951
+ 10.31219 | redirect-loop | 518
+ 10.31219 | spn2-wayback-error | 356
+ 10.31274 | | 296
+ 10.31743 | | 403
+ 10.31743 | success | 294
+ 10.31857 | | 209
+ 10.3233 | | 471
+ 10.33448 | | 213
+ 10.33448 | success | 212
+ 10.3389 | | 1459
+ 10.3389 | success | 1417
+ 10.3390 | | 4511
+ 10.3390 | success | 3577
+ 10.3390 | terminal-bad-status | 485
+ 10.3390 | forbidden | 379
+ 10.3406 | | 243
+ 10.3406 | terminal-bad-status | 213
+ 10.34944 | | 527
+ 10.34944 | success | 459
+ 10.35016 | | 688
+ 10.35016 | no-pdf-link | 687
+ 10.36347 | success | 213
+ 10.36347 | | 213
+ 10.37747 | | 213
+ 10.37747 | no-pdf-link | 213
+ 10.37904 | | 227
+ 10.37904 | no-pdf-link | 226
+ 10.3917 | | 347
+ 10.3917 | redirect-loop | 208
+ 10.3923 | | 356
+ 10.3923 | redirect-loop | 254
+ 10.3929 | | 317
+ 10.3929 | terminal-bad-status | 310
+ 10.3931 | | 279
+ 10.3931 | no-pdf-link | 279
+ 10.4000 | | 7828
+ 10.4000 | success | 3485
+ 10.4000 | spn2-wayback-error | 2142
+ 10.4000 | redirect-loop | 2106
+ 10.4018 | | 249
+ 10.4018 | not-found | 240
+ 10.4103 | | 726
+ 10.4103 | remote-server-error | 343
+ 10.4103 | redirect-loop | 324
+ 10.4159 | | 286
+ 10.4159 | link-loop | 238
+ 10.4324 | | 19398
+ 10.4324 | link-loop | 12471
+ 10.4324 | forbidden | 3632
+ 10.4324 | not-found | 2283
+ 10.4324 | terminal-bad-status | 645
+ 10.4324 | success | 208
+ 10.47295 | | 456
+ 10.47295 | success | 449
+ 10.47513 | | 218
+ 10.47513 | no-pdf-link | 203
+ 10.48084 | success | 538
+ 10.48084 | | 538
+ 10.5040 | | 375
+ 10.5040 | no-pdf-link | 365
+ 10.5167 | | 290
+ 10.5167 | redirect-loop | 278
+ 10.5169 | | 360
+ 10.5169 | no-pdf-link | 355
+ 10.5194 | | 917
+ 10.5194 | success | 887
+ 10.5216 | | 213
+ 10.5220 | no-pdf-link | 397
+ 10.5220 | | 397
+ 10.5281 | | 22551
+ 10.5281 | terminal-bad-status | 12158
+ 10.5281 | success | 4901
+ 10.5281 | no-pdf-link | 4754
+ 10.5281 | spn2-error:unknown | 360
+ 10.5282 | | 228
+ 10.5451 | | 2068
+ 10.5451 | success | 1071
+ 10.5451 | terminal-bad-status | 817
+ 10.5753 | | 268
+ 10.5753 | success | 264
+ 10.5771 | | 941
+ 10.5771 | no-pdf-link | 397
+ 10.5771 | bad-redirect | 269
+ 10.5771 | link-loop | 238
+ 10.6068 | | 441
+ 10.6068 | no-pdf-link | 384
+ 10.6084 | | 917
+ 10.6084 | no-pdf-link | 520
+ 10.6084 | success | 368
+ 10.7287 | | 234
+ 10.7287 | no-pdf-link | 212
+ 10.7312 | | 382
+ 10.7312 | link-loop | 291
+ 10.7554 | | 205
+ 10.7891 | | 380
+ 10.7891 | no-pdf-link | 376
+ 10.7916 | | 331
+ 10.7916 | no-pdf-link | 201
+ 10.7939 | | 535
+ 10.7939 | no-pdf-link | 527
+ | | 272831
+ | success | 62298
+ | no-pdf-link | 60737
+ | link-loop | 48558
+ | redirect-loop | 26842
+ | terminal-bad-status | 22685
+ | spn2-error:too-many-redirects | 11174
+ | forbidden | 10900
+ | spn2-wayback-error | 7796
+ | blocked-cookie | 6961
+ | not-found | 5468
+ | bad-redirect | 2666
+ | spn2-error | 2398
+ | spn2-cdx-lookup-failure | 1374
+ | petabox-error | 678
+ | remote-server-error | 461
+ | wrong-mimetype | 443
+ | spn2-error:proxy-error | 420
+ | spn2-error:unknown | 360
+(323 rows)
diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt
new file mode 100644
index 0000000..28dd0d0
--- /dev/null
+++ b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt
@@ -0,0 +1,307 @@
+ domain | status | count
+-------------------------------------------------------------------+-------------------------------+--------
+ 202.148.31.178 | | 298
+ academic.oup.com | | 1624
+ academic.oup.com | no-pdf-link | 673
+ academic.oup.com | bad-redirect | 444
+ academic.oup.com | link-loop | 358
+ aip.scitation.org | | 257
+ apps.crossref.org | | 1414
+ apps.crossref.org | no-pdf-link | 1410
+ article.sciencepublishinggroup.com | | 404
+ article.sciencepublishinggroup.com | success | 404
+ arxiv.org | | 24340
+ arxiv.org | success | 22381
+ arxiv.org | terminal-bad-status | 1260
+ arxiv.org | no-pdf-link | 412
+ arxiv.org | no-capture | 262
+ ashpublications.org | | 2049
+ ashpublications.org | no-pdf-link | 2024
+ asmedigitalcollection.asme.org | | 1245
+ asmedigitalcollection.asme.org | bad-redirect | 707
+ assets.researchsquare.com | | 1549
+ assets.researchsquare.com | success | 1546
+ bioone.org | | 201
+ biorxiv.org | redirect-loop | 702
+ biorxiv.org | | 702
+ blogs.ethz.ch | | 687
+ blogs.ethz.ch | no-pdf-link | 686
+ books.openedition.org | | 446
+ books.openedition.org | redirect-loop | 382
+ brill.com | | 2203
+ brill.com | link-loop | 1779
+ brill.com | success | 359
+ catalog.paradisec.org.au | | 770
+ catalog.paradisec.org.au | redirect-loop | 756
+ cdr.lib.unc.edu | | 9432
+ cdr.lib.unc.edu | redirect-loop | 5720
+ cdr.lib.unc.edu | spn2-wayback-error | 3187
+ cdr.lib.unc.edu | spn2-cdx-lookup-failure | 201
+ classiques-garnier.com | | 246
+ classiques-garnier.com | success | 243
+ content.iospress.com | | 242
+ content.taylorfrancis.com | | 309
+ content.taylorfrancis.com | terminal-bad-status | 309
+ curve.carleton.ca | success | 201
+ curve.carleton.ca | | 201
+ cyberdoi.ru | redirect-loop | 647
+ cyberdoi.ru | | 647
+ czasopisma.kul.pl | | 402
+ czasopisma.kul.pl | success | 294
+ d.lib.msu.edu | | 550
+ d.lib.msu.edu | no-pdf-link | 550
+ d197for5662m48.cloudfront.net | success | 276
+ d197for5662m48.cloudfront.net | | 276
+ dergipark.org.tr | | 674
+ dergipark.org.tr | no-pdf-link | 255
+ dergipark.org.tr | success | 248
+ digi.ub.uni-heidelberg.de | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 261
+ dl.acm.org | | 441
+ dl.acm.org | blocked-cookie | 361
+ dlc.library.columbia.edu | | 201
+ dlc.library.columbia.edu | no-pdf-link | 201
+ doi.ala.org.au | | 308
+ doi.ala.org.au | no-pdf-link | 308
+ doi.org | | 474
+ doi.org | terminal-bad-status | 344
+ downloads.hindawi.com | | 479
+ downloads.hindawi.com | success | 478
+ edoc.rki.de | | 238
+ edoc.unibas.ch | | 2018
+ edoc.unibas.ch | success | 1067
+ edoc.unibas.ch | terminal-bad-status | 817
+ elib.spbstu.ru | | 205
+ elifesciences.org | | 204
+ era.library.ualberta.ca | | 531
+ era.library.ualberta.ca | no-pdf-link | 527
+ erj.ersjournals.com | | 951
+ erj.ersjournals.com | redirect-loop | 829
+ europepmc.org | | 289
+ europepmc.org | success | 283
+ figshare.com | | 233
+ figshare.com | no-pdf-link | 208
+ fjfsdata01prod.blob.core.windows.net | | 1430
+ fjfsdata01prod.blob.core.windows.net | success | 1418
+ hw.oeaw.ac.at | | 283
+ hw.oeaw.ac.at | no-pdf-link | 283
+ idb.ub.uni-tuebingen.de | | 216
+ idb.ub.uni-tuebingen.de | terminal-bad-status | 215
+ ieeexplore.ieee.org | | 7561
+ ieeexplore.ieee.org | spn2-error:too-many-redirects | 6732
+ ieeexplore.ieee.org | success | 683
+ ijgc.bmj.com | | 411
+ ijgc.bmj.com | success | 399
+ jamanetwork.com | | 229
+ jitc.bmj.com | | 849
+ jitc.bmj.com | success | 773
+ journals.aps.org | | 539
+ journals.aps.org | not-found | 534
+ journals.lww.com | | 1124
+ journals.lww.com | no-pdf-link | 547
+ journals.lww.com | link-loop | 399
+ journals.openedition.org | | 7366
+ journals.openedition.org | success | 3484
+ journals.openedition.org | spn2-wayback-error | 2120
+ journals.openedition.org | redirect-loop | 1720
+ journals.plos.org | | 552
+ journals.plos.org | success | 542
+ kiss.kstudy.com | | 306
+ kiss.kstudy.com | no-pdf-link | 292
+ lib.dr.iastate.edu | | 297
+ link.springer.com | | 2830
+ link.springer.com | redirect-loop | 2625
+ linkinghub.elsevier.com | | 970
+ linkinghub.elsevier.com | forbidden | 415
+ linkinghub.elsevier.com | spn2-error:too-many-redirects | 357
+ medrxiv.org | | 287
+ medrxiv.org | redirect-loop | 287
+ muse.jhu.edu | | 470
+ muse.jhu.edu | terminal-bad-status | 443
+ ojs.ub.uni-konstanz.de | | 463
+ ojs.ub.uni-konstanz.de | success | 462
+ onlinelibrary.wiley.com | | 2064
+ onlinelibrary.wiley.com | terminal-bad-status | 1973
+ osf.io | | 1394
+ osf.io | redirect-loop | 589
+ osf.io | spn2-wayback-error | 425
+ osf.io | not-found | 342
+ othes.univie.ac.at | | 424
+ othes.univie.ac.at | success | 424
+ oxford.universitypressscholarship.com | | 8999
+ oxford.universitypressscholarship.com | link-loop | 8282
+ oxford.universitypressscholarship.com | no-pdf-link | 695
+ oxfordhandbooks.com | redirect-loop | 460
+ oxfordhandbooks.com | | 460
+ papers.ssrn.com | | 1313
+ papers.ssrn.com | link-loop | 1145
+ peerj.com | | 313
+ peerj.com | no-pdf-link | 212
+ periodicos.urca.br | | 446
+ periodicos.urca.br | success | 439
+ pos.sissa.it | | 277
+ pos.sissa.it | success | 262
+ preprints.jmir.org | | 242
+ pressto.amu.edu.pl | | 260
+ pressto.amu.edu.pl | success | 232
+ publikationsserver.tu-braunschweig.de | | 15358
+ publikationsserver.tu-braunschweig.de | no-pdf-link | 15228
+ publons.com | | 2810
+ publons.com | redirect-loop | 2359
+ publons.com | no-pdf-link | 444
+ pubs.acs.org | | 1647
+ pubs.acs.org | blocked-cookie | 1553
+ pubs.rsc.org | | 765
+ pubs.rsc.org | redirect-loop | 486
+ pubs.rsc.org | spn2-wayback-error | 214
+ res.mdpi.com | | 3620
+ res.mdpi.com | success | 3591
+ revistas.usal.es | | 580
+ revistas.usal.es | success | 298
+ revues.imist.ma | | 229
+ rsdjournal.org | | 213
+ rsdjournal.org | success | 212
+ s3-eu-west-1.amazonaws.com | | 764
+ s3-eu-west-1.amazonaws.com | success | 763
+ s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | | 324
+ s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | success | 324
+ saspublishers.com | | 213
+ saspublishers.com | success | 213
+ scholarshare.temple.edu | | 524
+ scholarshare.temple.edu | success | 464
+ sol.sbc.org.br | | 268
+ sol.sbc.org.br | success | 264
+ statisticaldatasets.data-planet.com | | 442
+ statisticaldatasets.data-planet.com | no-pdf-link | 390
+ watermark.silverchair.com | | 521
+ watermark.silverchair.com | success | 514
+ www.ahajournals.org | | 1061
+ www.ahajournals.org | blocked-cookie | 1011
+ www.atlantis-press.com | | 308
+ www.atlantis-press.com | spn2-wayback-error | 228
+ www.beck-elibrary.de | | 1202
+ www.beck-elibrary.de | no-pdf-link | 953
+ www.beck-elibrary.de | bad-redirect | 249
+ www.cairn.info | | 255
+ www.cairn.info | redirect-loop | 208
+ www.cambridge.org | | 2061
+ www.cambridge.org | no-pdf-link | 727
+ www.cambridge.org | success | 485
+ www.cambridge.org | link-loop | 388
+ www.cambridge.org | bad-redirect | 252
+ www.confer.cz | | 227
+ www.confer.cz | no-pdf-link | 226
+ www.dbpia.co.kr | | 773
+ www.dbpia.co.kr | no-pdf-link | 679
+ www.degruyter.com | | 17046
+ www.degruyter.com | link-loop | 14202
+ www.degruyter.com | success | 2201
+ www.degruyter.com | not-found | 235
+ www.dovepress.com | | 316
+ www.dovepress.com | success | 267
+ www.e-manuscripta.ch | | 384
+ www.e-manuscripta.ch | no-pdf-link | 383
+ www.e-periodica.ch | | 358
+ www.e-periodica.ch | no-pdf-link | 355
+ www.e-rara.ch | no-pdf-link | 279
+ www.e-rara.ch | | 279
+ www.e3s-conferences.org | | 426
+ www.e3s-conferences.org | success | 419
+ www.elibrary.ru | | 303
+ www.elibrary.ru | no-pdf-link | 301
+ www.emerald.com | | 943
+ www.emerald.com | no-pdf-link | 933
+ www.etasr.com | | 466
+ www.etasr.com | success | 466
+ www.eurekaselect.com | | 345
+ www.eurekaselect.com | no-pdf-link | 321
+ www.europeanproceedings.com | | 218
+ www.europeanproceedings.com | success | 218
+ www.finersistemas.com | success | 397
+ www.finersistemas.com | | 397
+ www.humankineticslibrary.com | no-pdf-link | 321
+ www.humankineticslibrary.com | | 321
+ www.ijcmas.com | | 251
+ www.ijcmas.com | no-pdf-link | 248
+ www.inderscience.com | | 524
+ www.inderscience.com | no-pdf-link | 501
+ www.ingentaconnect.com | | 366
+ www.ingentaconnect.com | no-pdf-link | 349
+ www.jstage.jst.go.jp | | 1591
+ www.jstage.jst.go.jp | success | 862
+ www.jstage.jst.go.jp | no-pdf-link | 567
+ www.jstor.org | | 351
+ www.karger.com | | 224
+ www.liebertpub.com | | 236
+ www.liebertpub.com | blocked-cookie | 228
+ www.mdpi.com | | 694
+ www.mdpi.com | terminal-bad-status | 480
+ www.medlit.ru | | 458
+ www.medlit.ru | redirect-loop | 366
+ www.morressier.com | | 285
+ www.morressier.com | no-pdf-link | 253
+ www.njca.info | | 223
+ www.njca.info | remote-server-error | 222
+ www.nomos-elibrary.de | | 913
+ www.nomos-elibrary.de | no-pdf-link | 379
+ www.nomos-elibrary.de | bad-redirect | 265
+ www.nomos-elibrary.de | link-loop | 236
+ www.onepetro.org | | 895
+ www.onepetro.org | redirect-loop | 853
+ www.osti.gov | | 212
+ www.persee.fr | | 232
+ www.persee.fr | terminal-bad-status | 213
+ www.repository.cam.ac.uk | | 439
+ www.research-collection.ethz.ch | | 312
+ www.research-collection.ethz.ch | terminal-bad-status | 310
+ www.revistas.ufg.br | | 212
+ www.schoeningh.de | | 371
+ www.schoeningh.de | link-loop | 366
+ www.scialert.net | | 276
+ www.scialert.net | redirect-loop | 254
+ www.scielo.br | | 644
+ www.scielo.br | success | 624
+ www.sciencedirect.com | | 6523
+ www.sciencedirect.com | no-pdf-link | 4668
+ www.sciencedirect.com | terminal-bad-status | 1737
+ www.scitepress.org | no-pdf-link | 397
+ www.scitepress.org | | 397
+ www.tandfonline.com | | 3448
+ www.tandfonline.com | blocked-cookie | 2446
+ www.tandfonline.com | terminal-bad-status | 714
+ www.taylorfrancis.com | | 21292
+ www.taylorfrancis.com | link-loop | 18648
+ www.taylorfrancis.com | forbidden | 2022
+ www.taylorfrancis.com | terminal-bad-status | 518
+ www.thieme-connect.de | | 513
+ www.thieme-connect.de | not-found | 292
+ www.thieme-connect.de | redirect-loop | 213
+ www.whateveryoneneedstoknow.com | | 1174
+ www.whateveryoneneedstoknow.com | redirect-loop | 1163
+ www.worldscientific.com | | 293
+ www.worldscientific.com | blocked-cookie | 240
+ www.zora.uzh.ch | | 290
+ www.zora.uzh.ch | redirect-loop | 278
+ zenodo.org | | 22202
+ zenodo.org | terminal-bad-status | 12158
+ zenodo.org | success | 4923
+ zenodo.org | no-pdf-link | 4788
+ | | 280719
+ | success | 85143
+ | no-pdf-link | 61335
+ | link-loop | 48566
+ | redirect-loop | 26845
+ | terminal-bad-status | 23955
+ | spn2-wayback-error | 7920
+ | spn2-error:too-many-redirects | 7175
+ | blocked-cookie | 6980
+ | forbidden | 2912
+ | bad-redirect | 2666
+ | spn2-error | 1943
+ | not-found | 1762
+ | spn2-cdx-lookup-failure | 1376
+ | wrong-mimetype | 467
+ | remote-server-error | 388
+ | spn2-error:proxy-error | 295
+ | no-capture | 262
+(304 rows)
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000..a9f5260
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+-- Remove both helper functions that the matching up.sql creates.
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000..d68895b
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- The created trigger is always named `set_updated_at` and runs
+-- `diesel_set_updated_at()` before each row update.
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+ -- Dynamic SQL: the target table is only known at call time, so the
+ -- CREATE TRIGGER statement is assembled with format() and then executed.
+ EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+ FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+-- Trigger function: stamps `updated_at` with the current timestamp when the
+-- row actually changed and the statement left `updated_at` itself untouched.
+-- Always returns NEW.
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    -- No column differs between OLD and NEW: pass the row through unchanged.
+    IF NEW IS NOT DISTINCT FROM OLD THEN
+        RETURN NEW;
+    END IF;
+
+    -- The statement modified `updated_at` itself: keep the supplied value.
+    IF NEW.updated_at IS DISTINCT FROM OLD.updated_at THEN
+        RETURN NEW;
+    END IF;
+
+    NEW.updated_at := current_timestamp;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql
new file mode 100644
index 0000000..a085480
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/down.sql
@@ -0,0 +1,8 @@
+-- Reverts the init migration. NOTE(review): up.sql also creates pdftrio, pdf_meta, html_meta, ingest_file_result and crossref, which are not dropped here -- confirm whether that is intended.
+DROP TABLE IF EXISTS cdx;
+DROP TABLE IF EXISTS file_meta;
+DROP TABLE IF EXISTS fatcat_file;
+DROP TABLE IF EXISTS petabox;
+DROP TABLE IF EXISTS grobid;
+DROP TABLE IF EXISTS ingest_request;
+DROP TABLE IF EXISTS shadow;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
new file mode 100644
index 0000000..10a5183
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -0,0 +1,184 @@
+
+-- cdx: one row per (url, datetime) pair (the primary key).
+--
+-- Rows *may* be revisit records, indicated by mimetype == "warc/revisit".
+-- Records are implied to be 200 status (or 226 for ftp), whether they are
+-- direct hits or revisits.
+-- Nothing prevents duplicate hits (eg, same sha1, same url, many datetimes);
+-- import scripts should make an effort to reduce that sort of duplication.
+-- One row per *domain*/sha1hex pair is a good guideline, but all ingest
+-- result url/dt pairs should be included.
+-- Any mimetype is allowed, but the presumption is that the actual body is the
+-- full manifestation of a work: no landing pages and no webcapture HTML (each
+-- only a part of a work). URLs that are parts of a fileset are allowed.
+CREATE TABLE IF NOT EXISTS cdx (
+    url             TEXT NOT NULL CHECK (octet_length(url) >= 1),
+    datetime        TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+    -- The sha1hex / cdx_sha1hex split is intended to capture the difference
+    -- between the CDX hash (of the transport-encoded body) and the hash of the
+    -- actual body. Probably need to include both for all records?
+    sha1hex         TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    cdx_sha1hex     TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+    mimetype        TEXT CHECK (octet_length(mimetype) >= 1),
+    -- TODO: enforce that only paths with '/' (item+file) should be included?
+    warc_path       TEXT CHECK (octet_length(warc_path) >= 1),
+    warc_csize      BIGINT,
+    warc_offset     BIGINT,
+    row_created     TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
+    PRIMARY KEY (url, datetime)
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx (sha1hex);
+-- TODO: remove this index? not currently used
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx (row_created);
+
+-- file_meta: hashes, size and mimetype of a file, keyed by its SHA-1 hex digest.
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
+CREATE TABLE IF NOT EXISTS file_meta (
+    sha1hex         TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    sha256hex       TEXT CHECK (octet_length(sha256hex) = 64),
+    md5hex          TEXT CHECK (octet_length(md5hex) = 32),
+    size_bytes      BIGINT,
+    mimetype        TEXT CHECK (octet_length(mimetype) >= 1)
+);
+-- IF NOT EXISTS keeps this file re-runnable, matching the table and the cdx indexes.
+CREATE INDEX IF NOT EXISTS file_meta_md5hex_idx ON file_meta(md5hex);
+
+-- fatcat_file: per-sha1hex fatcat identifiers (26-byte file and release idents).
+CREATE TABLE IF NOT EXISTS fatcat_file (
+    sha1hex                 TEXT CHECK (octet_length(sha1hex) = 40),
+    file_ident              TEXT CHECK (octet_length(file_ident) = 26),
+    first_release_ident     TEXT CHECK (octet_length(first_release_ident) = 26),
+    PRIMARY KEY (sha1hex)
+);
+
+-- petabox: one row per (item, path) pair, with the SHA-1 hex digest of that file.
+CREATE TABLE IF NOT EXISTS petabox (
+    item            TEXT NOT NULL CHECK (octet_length(item) >= 1),
+    path            TEXT NOT NULL CHECK (octet_length(path) >= 1),
+    sha1hex         TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    PRIMARY KEY(item, path)
+);
+-- IF NOT EXISTS keeps this file re-runnable, matching the table and the cdx indexes.
+CREATE INDEX IF NOT EXISTS petabox_sha1hex_idx ON petabox(sha1hex);
+
+-- grobid: one row per sha1hex with the GROBID status, version, and extracted
+-- metadata.
+CREATE TABLE IF NOT EXISTS grobid (
+    sha1hex             TEXT CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
+    grobid_version      TEXT CHECK (octet_length(grobid_version) >= 1),
+    status_code         INT NOT NULL,
+    status              TEXT CHECK (octet_length(status) >= 1),
+    fatcat_release      TEXT CHECK (octet_length(fatcat_release) = 26),
+    -- `metadata` holds the extracted basic biblio metadata:
+    --   title
+    --   authors[]
+    --      full/display
+    --      given_name
+    --      surname
+    --      affiliation
+    --   year
+    --   journal_issn
+    --   journal_name
+    --   refs_count
+    metadata            JSONB,
+    PRIMARY KEY (sha1hex)
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+-- pdftrio: one row per sha1hex with the pdftrio status, version, models date,
+-- and the four scores (ensemble, bert, linear, image).
+CREATE TABLE IF NOT EXISTS pdftrio (
+    sha1hex             TEXT CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
+    status_code         INT NOT NULL,
+    status              TEXT NOT NULL CHECK (octet_length(status) >= 1),
+    pdftrio_version     TEXT CHECK (octet_length(pdftrio_version) >= 1),
+    models_date         DATE,
+    ensemble_score      REAL,
+    bert_score          REAL,
+    linear_score        REAL,
+    image_score         REAL,
+    PRIMARY KEY (sha1hex)
+);
+
+-- pdf_meta: one row per sha1hex with PDF-level details (page/word counts,
+-- first-page dimensions, thumbnail flag) plus a free-form metadata document.
+CREATE TABLE IF NOT EXISTS pdf_meta (
+    sha1hex                 TEXT CHECK (octet_length(sha1hex) = 40),
+    updated                 TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
+    status                  TEXT NOT NULL CHECK (octet_length(status) >= 1),
+    has_page0_thumbnail     BOOLEAN NOT NULL,
+    page_count              INT CHECK (page_count >= 0),
+    word_count              INT CHECK (word_count >= 0),
+    page0_height            REAL CHECK (page0_height >= 0),
+    page0_width             REAL CHECK (page0_width >= 0),
+    permanent_id            TEXT CHECK (octet_length(permanent_id) >= 1),
+    pdf_created             TIMESTAMP WITH TIME ZONE,
+    pdf_version             TEXT CHECK (octet_length(pdf_version) >= 1),
+    -- maybe some analysis of available fields?
+    -- metadata JSON fields:
+    --    title
+    --    subject
+    --    author
+    --    creator
+    --    producer
+    --    CrossMarkDomains
+    --    doi
+    --    form
+    --    encrypted
+    metadata                JSONB,
+    PRIMARY KEY (sha1hex)
+);
+
+-- html_meta: one row per sha1hex with HTML status, scope, TEI-XML/thumbnail
+-- flags, word count, and two JSON documents (biblio, resources).
+CREATE TABLE IF NOT EXISTS html_meta (
+    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    -- The check must validate `scope` itself; it previously re-checked
+    -- `status`, so an empty-string scope was accepted.
+    scope               TEXT CHECK (octet_length(scope) >= 1),
+    has_teixml          BOOLEAN NOT NULL,
+    has_thumbnail       BOOLEAN NOT NULL,
+    word_count          INT CHECK (word_count >= 0),
+    biblio              JSONB,
+    resources           JSONB
+    -- biblio JSON fields are similar to fatcat release schema
+    -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
+-- ingest_request: one row per (link_source, link_source_id, ingest_type,
+-- base_url) request.
+CREATE TABLE IF NOT EXISTS ingest_request (
+    link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+    link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+    ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+    base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+    ingest_request_source   TEXT CHECK (octet_length(ingest_request_source) >= 1),
+    created                 TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    release_stage           TEXT CHECK (octet_length(release_stage) >= 1),
+    request                 JSONB,
+    -- request isn't required, but can stash extra fields there for import, eg:
+    --   ext_ids (source/source_id sometimes enough)
+    --   fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
+    --   edit_extra
+    -- ingest type can be: pdf, xml, html
+
+    PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
+);
+-- IF NOT EXISTS keeps this file re-runnable, matching the table and the cdx indexes.
+CREATE INDEX IF NOT EXISTS ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+
+-- ingest_file_result: one row per (ingest_type, base_url) with the latest
+-- result status and the terminal URL / datetime / status code / sha1hex.
+CREATE TABLE IF NOT EXISTS ingest_file_result (
+    ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+    base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+    updated                 TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    hit                     BOOLEAN NOT NULL,
+    -- The check must validate `status` itself; it previously checked
+    -- `terminal_url`, so an empty-string status was accepted.
+    status                  TEXT CHECK (octet_length(status) >= 1),
+    terminal_url            TEXT CHECK (octet_length(terminal_url) >= 1),
+    terminal_dt             TEXT CHECK (octet_length(terminal_dt) = 14),
+    terminal_status_code    INT,
+    terminal_sha1hex        TEXT CHECK (octet_length(terminal_sha1hex) = 40),
+
+    PRIMARY KEY (ingest_type, base_url)
+);
+-- IF NOT EXISTS keeps this file re-runnable, matching the table and the cdx indexes.
+CREATE INDEX IF NOT EXISTS ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
+CREATE INDEX IF NOT EXISTS ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+
+-- shadow: one row per (shadow_corpus, shadow_id) with the file's sha1hex and
+-- optional doi / pmid / isbn13 identifiers.
+CREATE TABLE IF NOT EXISTS shadow (
+    shadow_corpus       TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+    shadow_id           TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+    doi                 TEXT CHECK (octet_length(doi) >= 1),
+    pmid                TEXT CHECK (octet_length(pmid) >= 1),
+    isbn13              TEXT CHECK (octet_length(isbn13) >= 1),
+    PRIMARY KEY(shadow_corpus, shadow_id)
+);
+-- IF NOT EXISTS keeps this file re-runnable, matching the table and the cdx indexes.
+CREATE INDEX IF NOT EXISTS shadow_sha1hex_idx ON shadow(sha1hex);
+
+-- crossref: one JSON record per DOI; the DOI must be lower-case and at least
+-- four bytes long.
+CREATE TABLE IF NOT EXISTS crossref (
+    doi         TEXT NOT NULL PRIMARY KEY
+                CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+    indexed     TIMESTAMP WITH TIME ZONE NOT NULL,
+    record      JSON NOT NULL
+);
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
new file mode 100644
index 0000000..0859e79
--- /dev/null
+++ b/sql/monitoring_queries.md
@@ -0,0 +1,202 @@
+
+## fatcat-changelog pipeline
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Summary of significant domains and status, past 7 days:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+Summary of DOI prefix and status, past 7 days:
+
+ SELECT doi_prefix, status, count
+ FROM (
+ SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_request.link_source = 'doi'
+ ) t1
+ WHERE t1.doi_prefix != ''
+ GROUP BY CUBE (doi_prefix, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY doi_prefix ASC , count DESC;
+
+
+Throughput per day, and success, for past 30 days:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
+ ORDER BY date(ingest_request.created) DESC;
+
+## fatcat-ingest
+
+Broken domains, past 24 hours:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '7 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Throughput per day, and success, for past 24 hours:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_file_result.updated),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '7 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
+ ORDER BY date(ingest_file_result.updated) DESC;
+
+Overall status, updated requests past 48 hours:
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+## savepapernow and fatcat-ingest recent status
+
+Specific recent ingests (for debugging):
+
+ -- for record layout: \x
+ SELECT
+ ingest_file_result.status as status,
+ ingest_request.ingest_type as ingest_type,
+ ingest_request.ingest_request_source as source,
+ ingest_request.link_source_id as source_id,
+ ingest_request.base_url as base_url,
+ ingest_file_result.terminal_dt as dt,
+ ingest_file_result.terminal_status_code as status_code,
+ ingest_file_result.terminal_sha1hex as sha1hex,
+ grobid.status as grobid_status
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid
+ ON ingest_file_result.terminal_sha1hex = grobid.sha1hex
+ WHERE
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ -- AND ingest_request.ingest_type = 'pdf'
+ -- AND ingest_request.ingest_type = 'html'
+ AND (
+ ingest_request.ingest_request_source = 'savepapernow-web'
+ -- OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ )
+ ORDER BY ingest_file_result.updated DESC
+ LIMIT 100;
+
diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md
new file mode 100644
index 0000000..06f718c
--- /dev/null
+++ b/sql/pdftrio_queries.md
@@ -0,0 +1,65 @@
+
+## Counts / Status
+
+ SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code;
+
+ # NOTE: I earlier deleted a large fraction of non-200 status codes, so
+ # these aren't representative
+ status_code | count
+ -------------+---------
+ -4 | 16
+ -2 | 26
+ 200 | 1117501
+ 400 | 2695
+ (4 rows)
+
+
+ SELECT status, COUNT(*) FROM pdftrio GROUP BY status;
+
+ status | count
+ ---------------+---------
+ error | 2696
+ error-connect | 26
+ error-timeout | 16
+ success | 1118252
+ (4 rows)
+
+ SELECT
+ COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count,
+ COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count,
+ COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count,
+ COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count
+ FROM pdftrio;
+
+
+ ensemble_count | linear_count | bert_count | image_count
+ ----------------+--------------+------------+-------------
+ 1120100 | 976271 | 66209 | 143829
+ (1 row)
+
+## Histograms
+
+ SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND ensemble_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND bert_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND linear_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND image_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
diff --git a/sql/random_queries.md b/sql/random_queries.md
new file mode 100644
index 0000000..572b4f9
--- /dev/null
+++ b/sql/random_queries.md
@@ -0,0 +1,193 @@
+
+Basic stats (2019-09-23):
+
+ SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex);
+ => 28,023,760
+ => Time: 253897.213 ms (04:13.897)
+
+ SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex);
+ => 22,816,087
+ => Time: 287097.944 ms (04:47.098)
+
+ SELECT COUNT(*) FROM grobid;
+ => 56,196,992
+
+ SELECT COUNT(DISTINCT sha1hex) FROM cdx;
+ => 64,348,277
+ => Time: 572383.931 ms (09:32.384)
+
+ SELECT COUNT(*) FROM cdx;
+ => 74,796,777
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
+ => Time: 189067.335 ms (03:09.067)
+
+ mimetype | count
+ ------------------------+----------
+ application/pdf | 51049905
+ text/html | 24841846
+ text/xml | 524682
+ application/postscript | 81009
+ (4 rows)
+
+Time: 189067.335 ms (03:09.067)
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC;
+
+ status_code | count
+ -------------+----------
+ 200 | 56196992
+
+ compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount:
+
+ 200 49567139
+ 400 3464503
+ 409 691917
+ 500 247028
+ 503 123
+
+ SELECT row_to_json(cdx) FROM cdx LIMIT 5;
+
+ SELECT row_to_json(r) FROM (
+ SELECT url, datetime FROM cdx
+ ) r
+ LIMIT 5;
+
+More stats (2019-12-27):
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20;
+
+ SELECT SUM(size_bytes) FROM file_meta;
+
+"Last 24 hour progress":
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+## Parse URLs
+
+One approach is to do regexes, something like:
+
+ SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name;
+
+Eg:
+
+ SELECT DISTINCT(domain), COUNT(domain)
+ FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+Or:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+Can also do some quick lookups for a specific domain and protocol like:
+
+ SELECT *
+ FROM ingest_file_result
+ WHERE terminal_url LIKE 'https://insights.ovid.com/%'
+ LIMIT 10;
+
+For a given DOI prefix:
+
+ SELECT *
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/a%'
+ AND status = 'no-pdf-link'
+ LIMIT 10;
+
+ SELECT status, count(*)
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/%'
+ GROUP BY status
+ ORDER BY count(*) DESC;
+
+## Bulk Ingest
+
+Show bulk ingest status on links *added* in the past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Top *successful* domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Summarize non-success domains for the same:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
new file mode 100755
index 0000000..20fd82b
--- /dev/null
+++ b/sql/reingest_quarterly.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \
+    > /srv/sandcrawler/tasks/reingest_quarterly_current.json
+
+# Publish a random sample of at most 120000 requests. `shuf -n` samples
+# directly; with `shuf | head`, head exits early, shuf can be killed by
+# SIGPIPE, and `pipefail` then reports failure for an otherwise good run.
+shuf -n 120000 /srv/sandcrawler/tasks/reingest_quarterly_current.json \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
new file mode 100755
index 0000000..6fb1e4b
--- /dev/null
+++ b/sql/reingest_spn.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_spn.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \
+    > /srv/sandcrawler/tasks/reingest_spn.json
+
+# Publish a random sample of at most 60000 requests. `shuf -n` samples
+# directly; with `shuf | head`, head exits early, shuf can be killed by
+# SIGPIPE, and `pipefail` then reports failure for an otherwise good run.
+shuf -n 60000 /srv/sandcrawler/tasks/reingest_spn.json \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
new file mode 100755
index 0000000..04ce39d
--- /dev/null
+++ b/sql/reingest_weekly.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \
+    > /srv/sandcrawler/tasks/reingest_weekly_current.json
+
+# Publish a random sample of at most 60000 requests. `shuf -n` samples
+# directly; with `shuf | head`, head exits early, shuf can be killed by
+# SIGPIPE, and `pipefail` then reports failure for an otherwise good run.
+shuf -n 60000 /srv/sandcrawler/tasks/reingest_weekly_current.json \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql
new file mode 120000
index 0000000..a3756d4
--- /dev/null
+++ b/sql/sandcrawler_schema.sql
@@ -0,0 +1 @@
+migrations/2019-12-19-060141_init/up.sql \ No newline at end of file
diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt
new file mode 100644
index 0000000..444e448
--- /dev/null
+++ b/sql/stats/2020-01-13_stats.txt
@@ -0,0 +1,190 @@
+
+## SQL Table Sizes
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid" | 38 GB | 7076 MB | 45 GB
+ "public"."file_meta" | 23 GB | 11 GB | 34 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."ingest_request" | 363 MB | 625 MB | 988 MB
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+
+ total_count | total_size
+ -------------+-----------------
+ 118823340 | 140917467253923
+ (1 row)
+
+ # 118,823,340 => 118 million
+ # 140,917,467,253,923 => ~141 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 117185567
+ | 1509149
+ application/octet-stream | 87783
+ text/html | 9901
+ application/postscript | 3781
+ application/vnd.ms-powerpoint | 1421
+ text/plain | 1151
+ application/xml | 427
+ application/gzip | 414
+ application/msword | 314
+ (10 rows)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96141851 | 110030179
+ (1 row)
+
+ # 96,141,851
+ # 110,030,179
+
+Top mimetypes (not unique by sha1):
+
+ mimetype | count
+ ------------------------+----------
+ application/pdf | 84582642
+ text/html | 24841846
+ text/xml | 524682
+ application/postscript | 81009
+ (4 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 13675190 | 59919772
+
+ # 13,675,190
+ # 59,919,772
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 57382904
+ 500 | 2536862
+ 503 | 6
+ (3 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 41699385
+ | 15683279
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+ # 2,868,825
+ # 2,887,834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 2816171
+ pdf | arxiv | 154448
+ pdf | spn | 55
+ pdf | pubmed | 2
+ (4 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ (0 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+--------+----------
+ pdf | doi | 2816171 | 289199 | 0.103
+ pdf | arxiv | 154448 | 41105 | 0.266
+ pdf | spn | 55 | 46 | 0.836
+ pdf | pubmed | 2 | 0 | 0.000
+ (4 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+---------------------+---------
+ pdf | no-pdf-link | 2213720
+ pdf | success | 330492
+ pdf | spn-remote-error | 182157
+ pdf | spn-error | 141222
+ pdf | cdx-error | 83131
+ pdf | link-loop | 11350
+ pdf | other-mimetype | 6089
+ pdf | null-body | 1980
+ pdf | terminal-bad-status | 583
+ pdf | wayback-error | 381
+ (10 rows)
+
diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt
new file mode 100644
index 0000000..6bd43ea
--- /dev/null
+++ b/sql/stats/2020-01-31_supplement.txt
@@ -0,0 +1,42 @@
+
+How many file_meta still missing core metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 1,130,915
+
+Great! Not many.
+
+And are in petabox?
+
+ SELECT COUNT(*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL;
+ => 1,149,194
+
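+Almost all; maybe just some CDX fetch failures or something in there (caveat: the LEFT JOIN above also counts file_meta rows with no petabox match, so an INNER JOIN is needed to confirm). So,
+should run these on, eg, grobid2-vm.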
+
+ COPY (
+ SELECT row_to_json(petabox.*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL
+ ) TO '/grande/snapshots/dump_grobid_petabox_todo.json';
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a
+couple million files):
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 5072452 | 4130405
+
diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt
new file mode 100644
index 0000000..e7a00e8
--- /dev/null
+++ b/sql/stats/2020-02-24_stats.txt
@@ -0,0 +1,482 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ Size: 271.83G
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB
+ "public"."grobid" | 47 GB | 7213 MB | 54 GB
+ "public"."file_meta" | 26 GB | 12 GB | 38 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB
+ "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."pdftrio" | 78 MB | 64 MB | 142 MB
+ (10 rows)
+
+
+## File Metadata
+
+(skipping, no update)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 15,632,810 | 76,555,791
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 70656028
+ 500 | 5896836
+ -4 | 2295
+ 503 | 111
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 56001631
+ | 14654496
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2,868,825 | 2,887,834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591633
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ pdf | pubmed | 2
+ (6 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | | 2943896
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | | 629719
+ pdf | doi | fatcat-changelog | 129932
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | arxiv | fatcat-ingest | 998
+ pdf | spn | | 64
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | spn | savepapernow-web | 18
+ pdf | pubmed | | 2
+ pdf | doi | savepapernow-web | 1
+ (14 rows)
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ => UPDATE 2943896
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ => UPDATE 64
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ => UPDATE 629719
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ => UPDATE 1454
+
+ SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed';
+ DELETE FROM ingest_request WHERE link_source = 'pubmed';
+ => DELETE 2
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591637
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ (5 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | fatcat-changelog | 3073828
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630717
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | fatcat-ingest | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | spn | savepapernow-web | 82
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 1
+ (10 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ none?
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+---------+----------
+ pdf | doi | 6591637 | 1622702 | 0.246
+ pdf | pmc | 2030279 | 1241836 | 0.612
+ pdf | arxiv | 630743 | 500620 | 0.794
+ pdf | unpaywall | 1400 | 851 | 0.608
+ pdf | spn | 82 | 62 | 0.756
+ (5 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+---------
+ pdf | success | 3366189
+ pdf | no-pdf-link | 2902620
+ pdf | no-capture | 1672025
+ pdf | redirect-loop | 388844
+ pdf | cdx-error | 272780
+ pdf | terminal-bad-status | 171878
+ pdf | spn-remote-error | 163843
+ pdf | spn-error | 108070
+ pdf | null-body | 66778
+ pdf | link-loop | 43403
+ pdf | skip-url-blocklist | 34705
+ pdf | wrong-mimetype | 31343
+ pdf | wayback-error | 13012
+ pdf | spn2-cdx-lookup-failure | 6100
+ pdf | gateway-timeout | 5633
+ pdf | other-mimetype | 5114
+ pdf | spn2-error:proxy-error | 538
+ pdf | spn2-error:job-failed | 470
+ pdf | petabox-error | 415
+ pdf | spn2-error:browser-running-error | 136
+ pdf | spn2-error | 127
+ pdf | spn2-error:soft-time-limit-exceeded | 71
+ pdf | bad-redirect | 39
+ pdf | spn2-error:unknown | 30
+ pdf | spn2-error:browsing-timeout | 25
+ pdf | pending | 3
+ pdf | invalid-host-resolution | 1
+ (27 rows)
+
+
+## Fatcat Files
+
+(skipping, no update)
+
+## Recent Success/Failure of Ingest by Domain
+
+NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a
+small fraction of this is from changelog.
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------------+----------------+-------
+ linkinghub.elsevier.com | no-capture | 2579
+ www.mdpi.com | wrong-mimetype | 1313
+ onlinelibrary.wiley.com | no-pdf-link | 785
+ americanarchivist.org | no-pdf-link | 756
+ journals.sagepub.com | redirect-loop | 503
+ link.springer.com | redirect-loop | 432
+ iopscience.iop.org | no-capture | 392
+ www.tandfonline.com | no-pdf-link | 389
+ pubs.rsc.org | no-capture | 361
+ www.persee.fr | no-capture | 344
+ (10 rows)
+
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | success | 40578
+ pdf | cdx-error | 14982
+ pdf | no-capture | 7747
+ pdf | no-pdf-link | 7111
+ pdf | redirect-loop | 3265
+ pdf | wrong-mimetype | 1629
+ pdf | spn2-cdx-lookup-failure | 657
+ pdf | link-loop | 538
+ pdf | null-body | 517
+ pdf | terminal-bad-status | 400
+ pdf | wayback-error | 79
+ pdf | spn2-error:job-failed | 53
+ pdf | gateway-timeout | 38
+ pdf | spn2-error:soft-time-limit-exceeded | 7
+ pdf | spn2-error | 6
+ pdf | petabox-error | 5
+ pdf | spn2-error:browsing-timeout | 4
+ pdf | spn2-error:unknown | 2
+ pdf | bad-redirect | 1
+ pdf | pending | 1
+ (20 rows)
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+---------+---------
+ pdf | 2020-02-25 | 32660 | 14322
+ pdf | 2020-02-24 | 44967 | 26263
+ pdf | 2020-02-23 | 58795 | 18874
+ pdf | 2020-02-22 | 844249 | 272606
+ pdf | 2020-02-21 | 1287378 | 433487
+ pdf | 2020-02-20 | 1455943 | 492408
+ pdf | 2020-02-19 | 21453 | 7529
+ pdf | 2020-02-18 | 5863 | 2926
+ pdf | 2020-02-17 | 3737 | 970
+ pdf | 2020-02-16 | 13779 | 4862
+ pdf | 2020-02-15 | 1021020 | 623020
+ pdf | 2020-02-14 | 1036036 | 632830
+ pdf | 2020-02-13 | 13503 | 5824
+ pdf | 2020-02-12 | 20078 | 11422
+ pdf | 2020-02-11 | 13499 | 6781
+ pdf | 2020-02-10 | 2275 | 961
+ pdf | 2020-02-09 | 3231 | 1494
+ pdf | 2020-02-08 | 8967 | 4400
+ pdf | 2020-02-07 | 7022 | 2430
+ pdf | 2020-02-06 | 1291 | 516
+ pdf | 2020-02-05 | 8586 | 6596
+ pdf | 2020-02-04 | 3681 | 3593
+ pdf | 2020-02-03 | 284 | 284
+ pdf | 2020-02-02 | 480 | 480
+ pdf | 2020-02-01 | 489 | 336
+ pdf | 2020-01-31 | 1187 | 1130
+ pdf | 2020-01-30 | 1613 | 1288
+ pdf | 2020-01-29 | 947 | 279
+ pdf | 2020-01-28 | 667 | 323
+ (29 rows)
+
+Top "no-capture" domains (will need to re-ingest using live tool):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------+------------+--------
+ linkinghub.elsevier.com | no-capture | 320065
+ iopscience.iop.org | no-capture | 46858
+ pubs.rsc.org | no-capture | 43331
+ www.persee.fr | no-capture | 38971
+ www.doiserbia.nb.rs | no-capture | 27112
+ academic.oup.com | no-capture | 18877
+ www.osapublishing.org | no-capture | 17113
+ osf.io | no-capture | 16978
+ scripts.iucr.org | no-capture | 14844
+ www.degruyter.com | no-capture | 8093
+ mab-online.nl | no-capture | 6603
+ insights.ovid.com | no-capture | 6457
+ ir.lib.uth.gr | no-capture | 3625
+ www.sciencedirect.com | no-capture | 3244
+ www.tandfonline.com | no-capture | 3201
+ www.ccsenet.org | no-capture | 2849
+ www.intechopen.com | no-capture | 2813
+ primary-hospital-care.ch | no-capture | 2774
+ www.nature.com | no-capture | 2484
+ www.indianjournals.com | no-capture | 2432
+ journals.aps.org | no-capture | 2197
+ journals.sagepub.com | no-capture | 2064
+ www.episodes.org | no-capture | 1805
+ periodicos.uninove.br | no-capture | 1692
+ escholarship.org | no-capture | 1666
+ (25 rows)
+
+Top "no-pdf-link" domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-pdf-link'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -----------------------------+-------------+--------
+ plutof.ut.ee | no-pdf-link | 685315
+ www.gbif.org | no-pdf-link | 670647
+ doi.pangaea.de | no-pdf-link | 301984
+ www.plate-archive.org | no-pdf-link | 209218
+ onlinelibrary.wiley.com | no-pdf-link | 84890
+ figshare.com | no-pdf-link | 72892
+ zenodo.org | no-pdf-link | 45768
+ www.tandfonline.com | no-pdf-link | 43848
+ data.mendeley.com | no-pdf-link | 42367
+ springernature.figshare.com | no-pdf-link | 35941
+ dhz.uni-passau.de | no-pdf-link | 29187
+ www.frontiersin.org | no-pdf-link | 17925
+ digital.ucd.ie | no-pdf-link | 16769
+ mr.crossref.org | no-pdf-link | 14999
+ journals.lww.com | no-pdf-link | 12122
+ musewide.aip.de | no-pdf-link | 10854
+ datadryad.org | no-pdf-link | 10686
+ www.jstor.org | no-pdf-link | 9159
+ koreascience.or.kr | no-pdf-link | 9067
+ easy.dans.knaw.nl | no-pdf-link | 8264
+ scielo.conicyt.cl | no-pdf-link | 8069
+ www.degruyter.com | no-pdf-link | 7989
+ www.kci.go.kr | no-pdf-link | 6990
+ www.m-hikari.com | no-pdf-link | 6941
+ cshprotocols.cshlp.org | no-pdf-link | 6553
+ (25 rows)
+
+Top block-ish domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status')
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ ---------------------------------+---------------------+-------
+ journals.openedition.org | redirect-loop | 30395
+ ieeexplore.ieee.org | redirect-loop | 28926
+ www.degruyter.com | redirect-loop | 18891
+ www.cairn.info | link-loop | 8919
+ www.frontiersin.org | terminal-bad-status | 6786
+ projecteuclid.org | link-loop | 6098
+ www.mdpi.com | terminal-bad-status | 5189
+ medicalforum.ch | terminal-bad-status | 4596
+ jrnl.nau.edu.ua | link-loop | 4238
+ www.revistas.unam.mx | link-loop | 3926
+ journals.aps.org | redirect-loop | 3696
+ www.ijcseonline.org | redirect-loop | 3567
+ www.researchsquare.com | terminal-bad-status | 3453
+ www.persee.fr | terminal-bad-status | 3221
+ www.baltistica.lt | link-loop | 2098
+ osf.io | redirect-loop | 2004
+ seer.ufrgs.br | terminal-bad-status | 2002
+ jtd.amegroups.com | link-loop | 1738
+ www.hindawi.com | terminal-bad-status | 1613
+ linkinghub.elsevier.com | redirect-loop | 1612
+ www.scienceopen.com | terminal-bad-status | 1580
+ atm.amegroups.com | link-loop | 1571
+ scielo.conicyt.cl | terminal-bad-status | 1491
+ repozytorium.ur.edu.pl | redirect-loop | 1279
+ agupubs.onlinelibrary.wiley.com | link-loop | 1182
+ (25 rows)
+
diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt
new file mode 100644
index 0000000..55f0c1e
--- /dev/null
+++ b/sql/stats/2020-05-03_stats.txt
@@ -0,0 +1,418 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 41 GB | 82 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 59 GB | 7604 MB | 66 GB
+ "public"."file_meta" | 31 GB | 28 GB | 59 GB
+ "public"."ingest_request" | 19 GB | 20 GB | 39 GB
+ "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (10 rows)
+
+ Size: 383.93G
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 158059828 | 197346217653010
+ (1 row)
+
+ => 158 million, 197 terabytes
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 157805029
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6040
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1027125
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+
+ unique_sha1 | total
+ -------------+-----------
+ 92936564 | 111022039
+ (1 row)
+
+ => 111 million rows, 92.9 million files
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------------------------------------+-----------
+ application/pdf | 104178718
+ warc/revisit | 5274410
+ text/xml | 519042
+ text/html | 295523
+ application/octet-stream | 259681
+ unk | 138930
+ application/postscript | 81065
+ application/save | 80765
+ binary/octet-stream | 59804
+ application/x-download | 27083
+ text/plain | 26938
+ application/download | 25125
+ image/pdf | 16095
+ application/force-download | 9004
+ application/x-msdownload | 3711
+ application | 2934
+ application/x-octetstream | 2926
+ multipart/form-data | 2741
+ application/x-pdf | 2444
+ .pdf | 2368
+ application/binary | 1268
+ application/pdf' | 1192
+ pdf | 1113
+ file/unknown | 1086
+ application/unknown | 761
+ file | 753
+ application/blob | 670
+ application/octetstream | 657
+ text/pdf | 549
+ 0 | 417
+ ('application/pdf', | 349
+ application/http;msgtype=response | 251
+ application/doc | 180
+ [...] (wasn't LIMIT 25)
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+
+ unique_releases | total
+ -----------------+----------
+ 17455441 | 92707544
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057023
+ | 14638425
+ (2 rows)
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057074
+ | 3
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | unpaywall | 26244088
+ pdf | mag | 25596658
+ pdf | doi | 15652966
+ pdf | pmc | 2043646
+ pdf | arxiv | 721902
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 103
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | unpaywall | unpaywall | 26244088
+ pdf | mag | mag-corpus | 25596658
+ pdf | doi | fatcat-ingest | 8267308
+ pdf | doi | fatcat-changelog | 3869772
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630719
+ pdf | arxiv | fatcat-changelog | 91157
+ pdf | pmc | fatcat-ingest | 10195
+ pdf | pmc | fatcat-changelog | 4626
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 103
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 15
+ (15 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ pdf | mag | 47
+ pdf | unpaywall | 1
+ (2 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | unpaywall | 26244088 | 19968092 | 0.761
+ pdf | mag | 25596658 | 18712912 | 0.731
+ pdf | doi | 15653166 | 2878833 | 0.184
+ pdf | pmc | 2043646 | 1279529 | 0.626
+ pdf | arxiv | 721902 | 592394 | 0.821
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 103 | 82 | 0.796
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 37449502
+ pdf | no-pdf-link | 10908442
+ pdf | no-capture | 5643670
+ pdf | redirect-loop | 4823502
+ pdf | terminal-bad-status | 1715056
+ pdf | link-loop | 1425072
+ pdf | cdx-error | 535365
+ pdf | gateway-timeout | 267654
+ pdf | skip-url-blocklist | 220433
+ pdf | wrong-mimetype | 189804
+ pdf | spn2-cdx-lookup-failure | 103926
+ pdf | spn-error | 101777
+ pdf | wayback-error | 93517
+ pdf | null-body | 87279
+ pdf | invalid-host-resolution | 35305
+ pdf | spn-remote-error | 28888
+ pdf | petabox-error | 12406
+ pdf | spn2-error | 2905
+ pdf | spn2-error:job-failed | 2307
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 745
+ pdf | spn2-error:proxy-error | 438
+ pdf | spn2-error:invalid-url-syntax | 406
+ pdf | spn2-error:soft-time-limit-exceeded | 405
+ pdf | spn2-error:browser-running-error | 274
+ (25 rows)
+
+Failures by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | domain | status | count
+ -------------+---------------------------------------+---------------------+--------
+ pdf | ssl.fao.org | no-pdf-link | 862277
+ pdf | www.researchgate.net | redirect-loop | 749094
+ pdf | www.e-periodica.ch | no-pdf-link | 747370
+ pdf | ieeexplore.ieee.org | redirect-loop | 707482
+ pdf | plutof.ut.ee | no-pdf-link | 685341
+ pdf | www.gbif.org | no-pdf-link | 670905
+ pdf | dlc.library.columbia.edu | no-pdf-link | 508281
+ pdf | figshare.com | no-pdf-link | 400501
+ pdf | onlinelibrary.wiley.com | no-pdf-link | 399187
+ pdf | watermark.silverchair.com | terminal-bad-status | 357188
+ pdf | www.die-bonn.de | redirect-loop | 352903
+ pdf | academic.oup.com | no-pdf-link | 346828
+ pdf | iopscience.iop.org | terminal-bad-status | 345147
+ pdf | linkinghub.elsevier.com | no-capture | 328434
+ pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206
+ pdf | cyberleninka.ru | link-loop | 309525
+ pdf | www.tandfonline.com | no-pdf-link | 309146
+ pdf | dialnet.unirioja.es | terminal-bad-status | 307572
+ pdf | doi.pangaea.de | no-pdf-link | 304924
+ pdf | journals.sagepub.com | no-pdf-link | 285774
+ pdf | papers.ssrn.com | link-loop | 282415
+ pdf | dialnet.unirioja.es | redirect-loop | 274476
+ pdf | ieeexplore.ieee.org | link-loop | 273607
+ pdf | catalog.paradisec.org.au | redirect-loop | 234653
+ pdf | www.plate-archive.org | no-pdf-link | 209217
+ pdf | zenodo.org | no-pdf-link | 200078
+ pdf | zenodo.org | no-capture | 199025
+ pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084
+ pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039
+ pdf | validate.perfdrive.com | no-pdf-link | 180191
+ (30 rows)
+
+Success by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | domain | status | count
+ -------------+----------------------------+---------+---------
+ pdf | www.jstage.jst.go.jp | success | 2244620
+ pdf | europepmc.org | success | 1284770
+ pdf | link.springer.com | success | 1017998
+ pdf | www.scielo.br | success | 799577
+ pdf | arxiv.org | success | 592622
+ pdf | downloads.hindawi.com | success | 527278
+ pdf | res.mdpi.com | success | 501093
+ pdf | hal.archives-ouvertes.fr | success | 447877
+ pdf | digital.library.unt.edu | success | 404460
+ pdf | www.cambridge.org | success | 394666
+ pdf | dergipark.org.tr | success | 373706
+ pdf | journals.plos.org | success | 296994
+ pdf | watermark.silverchair.com | success | 275562
+ pdf | www.nature.com | success | 263836
+ pdf | cds.cern.ch | success | 223057
+ pdf | www.pnas.org | success | 220488
+ pdf | s3-eu-west-1.amazonaws.com | success | 214558
+ pdf | www.jbc.org | success | 205277
+ pdf | www.redalyc.org | success | 193591
+ pdf | iopscience.iop.org | success | 175796
+ pdf | apps.dtic.mil | success | 170589
+ pdf | zenodo.org | success | 167812
+ pdf | peerj.com | success | 155620
+ pdf | www.biorxiv.org | success | 149337
+ pdf | 210.101.116.28 | success | 145706
+ pdf | www.teses.usp.br | success | 145438
+ pdf | absimage.aps.org | success | 144400
+ pdf | hrcak.srce.hr | success | 134669
+ pdf | www.erudit.org | success | 131771
+ pdf | babel.hathitrust.org | success | 130645
+ (30 rows)
+
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ => NOT RUN, fatcat_file table is way out of date
+
diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt
new file mode 100644
index 0000000..d1993fc
--- /dev/null
+++ b/sql/stats/2020-07-23_stats.txt
@@ -0,0 +1,347 @@
+
+Summary:
+
+- far more PDFs have been processed by GROBID than by pdf_meta
+- about 1 million file_meta still have partial metadata (eg, no sha256)
+- database size still under 0.5 TByte
+- there are about a million CDX error ingest requests, and hundreds of
+ thousands of SPN errors which could be re-run
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 42 GB | 84 GB
+ "public"."ingest_request" | 34 GB | 39 GB | 73 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 61 GB | 7742 MB | 69 GB
+ "public"."file_meta" | 32 GB | 29 GB | 61 GB
+ "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+ Size: 466.91G
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 161944425 | 204,402,677,360,189
+ (1 row)
+
+ # 161.9 mil; 204 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 161691608
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1015337
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96537611 | 116281981
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------+-----------
+ application/pdf | 108706978
+ warc/revisit | 5912013
+ text/xml | 519042
+ application/octet-stream | 307782
+ text/html | 295634
+ unk | 156937
+ application/postscript | 81079
+ application/save | 80871
+ binary/octet-stream | 61263
+ text/plain | 31495
+ application/x-download | 30511
+ application/download | 26716
+ image/pdf | 26357
+ application/force-download | 10541
+ multipart/form-data | 5551
+ application/x-msdownload | 3724
+ application/x-octetstream | 3216
+ application | 3171
+ .pdf | 2728
+ application/x-pdf | 2563
+ application/binary | 1306
+ application/pdf' | 1192
+ pdf | 1180
+ [...]
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+
+ total_files | unique_releases
+ -------------+-----------------
+ 95557413 | 18020570
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+
+ status_code | count
+ -------------+----------
+ 200 | 88450610
+ 500 | 7101098
+ -4 | 4133
+ 503 | 110
+
+ SELECT status, COUNT(*) FROM grobid GROUP BY status ORDER BY COUNT DESC LIMIT 10;
+
+ status | count
+ ----------------+----------
+ success | 73814297
+ | 14638412
+ error | 7101308
+ error-timeout | 4133
+ bad-grobid-xml | 6
+ (5 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 73813427
+ | 14638425
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 16589669
+ pdf | pmc | 2231113
+ pdf | arxiv | 794693
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 148
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8320832
+ pdf | doi | fatcat-changelog | 4752956
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630750
+ pdf | pmc | fatcat-ingest | 194781
+ pdf | arxiv | fatcat-changelog | 163924
+ pdf | pmc | fatcat-changelog | 7507
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 148
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 19
+ pdf | arxiv | savepapernow-web | 2
+
+Uncrawled requests by source:
+
+ # TODO: verify this? seems wrong
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | mag | 4097008
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 5346057 | 0.104
+ pdf | mag | 35015357 | 22199271 | 0.634
+ pdf | unpaywall | 27653003 | 22067338 | 0.798
+ pdf | doi | 16589700 | 3207661 | 0.193
+ pdf | pmc | 2231113 | 1696976 | 0.761
+ pdf | arxiv | 794727 | 645607 | 0.812
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 148 | 114 | 0.770
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 46465271
+ pdf | no-capture | 46115869
+ pdf | no-pdf-link | 13877460
+ pdf | redirect-loop | 5943956
+ pdf | terminal-bad-status | 1962754
+ pdf | link-loop | 1630078
+ pdf | cdx-error | 1014409
+ pdf | gateway-timeout | 459340
+ pdf | wrong-mimetype | 321774
+ pdf | skip-url-blocklist | 220629
+ pdf | wayback-error | 220453
+ pdf | spn2-cdx-lookup-failure | 143963
+ pdf | null-body | 113384
+ pdf | spn-error | 101773
+ pdf | invalid-host-resolution | 37367
+ pdf | spn-remote-error | 28886
+ pdf | petabox-error | 22997
+ pdf | spn2-error | 16342
+ pdf | spn2-error:job-failed | 5017
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 746
+ pdf | spn2-error:soft-time-limit-exceeded | 632
+ pdf | spn2-error:proxy-error | 437
+ pdf | spn2-error:invalid-url-syntax | 417
+ pdf | timeout | 417
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+
+ total_count | release_count
+ -------------+---------------
+ 5862666 | 4728824
+ (1 row)
+
+## PDF Meta
+
+Total rows:
+
+ SELECT COUNT(*) as total_count FROM pdf_meta;
+
+
+ total_count
+ -------------
+ 21961874
+
+By status:
+
+ SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC;
+
+ status | count
+ ----------------+----------
+ success | 21788507
+ parse-error | 78196
+ text-too-large | 60595
+ not-pdf | 31679
+ error-wayback | 2639
+ bad-unicode | 251
+ bad-pdf | 6
+ empty-blob | 1
+ (8 rows)
+
diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt
new file mode 100644
index 0000000..3bc27b0
--- /dev/null
+++ b/sql/stats/2020-09-14_stats.txt
@@ -0,0 +1,340 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 44 GB | 45 GB | 89 GB
+ "public"."grobid" | 66 GB | 8127 MB | 74 GB
+ "public"."ingest_request" | 34 GB | 40 GB | 73 GB
+ "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."file_meta" | 33 GB | 30 GB | 63 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 167021210 | 221982345333674
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 166765214
+ application/octet-stream | 155517
+ application/xml | 42170
+ text/html | 18708
+ text/plain | 15990
+ application/gzip | 6491
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62960
+ (1 row)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 102123051 | 126550160
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 116885565
+ warc/revisit | 7951816
+ text/xml | 519042
+ application/octet-stream | 327639
+ text/html | 295725
+ unk | 172491
+ application/postscript | 81095
+ application/save | 80900
+ binary/octet-stream | 61783
+ text/plain | 33684
+ image/pdf | 32856
+ application/x-download | 32418
+ application/download | 27672
+ application/force-download | 10892
+ multipart/form-data | 5750
+ application/x-msdownload | 3832
+ application/x-octetstream | 3516
+ application | 3499
+ .pdf | 3038
+ application/x-pdf | 2701
+ application/binary | 1322
+ pdf | 1232
+ file/unknown | 1199
+ application/pdf' | 1192
+ file | 979
+ (25 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 101494314 | 18919012
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 93730358
+ 500 | 7759103
+ -4 | 4683
+ 503 | 150
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 80838234
+ | 12892145
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 17362763
+ pdf | pmc | 2248854
+ pdf | arxiv | 835400
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 197
+ (9 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8399261
+ pdf | doi | fatcat-changelog | 5449349
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 634665
+ pdf | pmc | fatcat-ingest | 210453
+ pdf | arxiv | fatcat-changelog | 200707
+ pdf | pmc | fatcat-changelog | 9582
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 197
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 21
+ pdf | arxiv | savepapernow-web | 2
+ (17 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 170304
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+ (3 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14144314 | 0.276
+ pdf | mag | 35015357 | 24811947 | 0.709
+ pdf | unpaywall | 27653003 | 22302629 | 0.807
+ pdf | doi | 17363369 | 3533568 | 0.204
+ pdf | pmc | 2248860 | 1713197 | 0.762
+ pdf | arxiv | 835400 | 685219 | 0.820
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 197 | 138 | 0.701
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 58265365
+ pdf | no-pdf-link | 27216435
+ pdf | no-capture | 21982611
+ pdf | redirect-loop | 8457469
+ pdf | terminal-bad-status | 2695023
+ pdf | link-loop | 2209672
+ pdf | wrong-mimetype | 767508
+ pdf | gateway-timeout | 548870
+ pdf | cdx-error | 391611
+ pdf | skip-url-blocklist | 220661
+ pdf | null-body | 182215
+ pdf | wayback-error | 146869
+ pdf | spn2-cdx-lookup-failure | 107229
+ pdf | spn-error | 85128
+ pdf | invalid-host-resolution | 37352
+ pdf | petabox-error | 32490
+ pdf | spn2-error | 29212
+ pdf | spn-remote-error | 27927
+ pdf | other-mimetype | 2305
+ pdf | bad-redirect | 1524
+ pdf | spn2-error:job-failed | 1521
+ pdf | timeout | 842
+ pdf | spn2-error:soft-time-limit-exceeded | 793
+ pdf | redirects-exceeded | 748
+ pdf | spn2-error:invalid-url-syntax | 417
+ (25 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*)
+ FROM ingest_file_result
+ WHERE hit = false
+ GROUP BY ingest_type, terminal_status_code
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 34064937
+ pdf | | 20514531
+ pdf | 301 | 7271700
+ pdf | 302 | 720632
+ pdf | 503 | 712697
+ pdf | 400 | 444209
+ pdf | 404 | 331495
+ pdf | 403 | 323030
+ pdf | 401 | 259327
+ pdf | 500 | 236122
+ pdf | 303 | 101609
+ pdf | 429 | 47738
+ pdf | 502 | 36183
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14790
+ pdf | 999 | 8996
+ pdf | 307 | 3769
+ pdf | 308 | 3422
+ pdf | 202 | 3228
+ pdf | 520 | 2058
+ pdf | 410 | 1734
+ pdf | 521 | 1033
+ pdf | 504 | 868
+ pdf | 505 | 424
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 6600758 | 5213294
+ (1 row)
+
diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt
new file mode 100644
index 0000000..fca76b9
--- /dev/null
+++ b/sql/stats/2021-04-07_stats.txt
@@ -0,0 +1,430 @@
+
+## SQL Table Sizes
+
+ Size: 551.34G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 50 GB | 100 GB
+ "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB
+ "public"."ingest_request" | 39 GB | 45 GB | 83 GB
+ "public"."grobid" | 70 GB | 8613 MB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB
+ "public"."file_meta" | 35 GB | 31 GB | 66 GB
+ "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB
+ "public"."shadow" | 9517 MB | 10 GB | 20 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."html_meta" | 1172 MB | 10 MB | 1182 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (12 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 174200807 | 234313766162033
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 173816433
+ application/octet-stream | 155534
+ text/html | 115821
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 721
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297
+ application/x-compress | 272
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ image/png | 88
+ application/mac-binhex40 | 79
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ application/vnd.ms-excel | 28
+ message/rfc822 | 26
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62271
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 113880640 | 141793694
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 131346703
+ warc/revisit | 8394443
+ text/xml | 525481
+ application/octet-stream | 502400
+ text/html | 417579
+ unk | 186703
+ application/postscript | 81095
+ application/save | 80915
+ binary/octet-stream | 66698
+ application/x-download | 35771
+ text/plain | 35606
+ image/pdf | 33904
+ application/download | 29701
+ application/force-download | 16726
+ multipart/form-data | 6878
+ application/x-msdownload | 3843
+ application | 3724
+ application/x-octetstream | 3550
+ .pdf | 3138
+ application/x-pdf | 2780
+ application/binary | 1332
+ pdf | 1247
+ file/unknown | 1200
+ application/pdf' | 1192
+ file | 1108
+ application/unknown | 978
+ application/octetstream | 856
+ application/blob | 673
+ text/pdf | 672
+ 0 | 546
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 105594307 | 19594878
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+----------
+ 200 | 97714631
+ 500 | 7875192
+ -4 | 4772
+ 503 | 520
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 84822508
+ | 12892147
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 31772942
+ pdf | doi | 23528817
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2277417
+ pdf | arxiv | 2143549
+ xml | doaj | 9442
+ html | doi | 3022
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 469
+ html | spn | 9
+ (14 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 31772942
+ pdf | doi | fatcat-changelog | 11010764
+ pdf | doi | fatcat-ingest | 9002119
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767703
+ pdf | arxiv | fatcat-changelog | 375818
+ pdf | pmc | fatcat-ingest | 211264
+ pdf | pmc | fatcat-changelog | 37328
+ xml | doaj | doaj | 9442
+ html | doi | fatcat-ingest | 3018
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 469
+ pdf | doi | savepapernow-web | 74
+ pdf | arxiv | fatcat-ingest-container | 26
+ html | spn | savepapernow-web | 9
+ html | doi | savepapernow-web | 4
+ pdf | arxiv | savepapernow-web | 2
+ (23 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 168462
+ pdf | oai | 15286
+ pdf | doaj | 2068
+ html | doaj | 620
+ pdf | unpaywall | 13
+ (5 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14163500 | 0.277
+ pdf | mag | 35015357 | 24818176 | 0.709
+ pdf | unpaywall | 31772942 | 25018501 | 0.787
+ pdf | doi | 23529041 | 5773728 | 0.245
+ pdf | doaj | 4264610 | 2851328 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2277417 | 1736491 | 0.762
+ pdf | arxiv | 2143549 | 2011378 | 0.938
+ xml | doaj | 9442 | 6897 | 0.730
+ html | doi | 3022 | 957 | 0.317
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 469 | 328 | 0.699
+ html | spn | 9 | 2 | 0.222
+ (14 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------------+----------
+ pdf | success | 66487928
+ pdf | no-pdf-link | 29279677
+ pdf | no-capture | 22765431
+ pdf | redirect-loop | 9155767
+ pdf | terminal-bad-status | 3549665
+ pdf | link-loop | 2592983
+ html | wrong-scope | 1088793
+ pdf | wrong-mimetype | 792563
+ pdf | gateway-timeout | 478181
+ html | no-capture | 423917
+ pdf | wayback-content-error | 355828
+ pdf | cdx-error | 343862
+ pdf | null-body | 328774
+ pdf | forbidden | 286647
+ pdf | spn2-cdx-lookup-failure | 276769
+ pdf | spn2-wayback-error | 276080
+ pdf | skip-url-blocklist | 265473
+ html | redirect-loop | 212916
+ pdf | not-found | 204367
+ html | unknown-scope | 204112
+ html | html-resource-no-capture | 166034
+ pdf | blocked-cookie | 160336
+ pdf | too-many-redirects | 152984
+ html | success | 123896
+ pdf | wayback-error | 114388
+ html | null-body | 100296
+ pdf | spn2-error:too-many-redirects | 58336
+ html | wayback-content-error | 53926
+ pdf | invalid-host-resolution | 37226
+ pdf | petabox-error | 37177
+ pdf | remote-server-error | 36439
+ pdf | spn2-error | 27556
+ pdf | spn2-error:proxy-error | 25486
+ pdf | read-timeout | 20745
+ html | wrong-mimetype | 18928
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13533
+ pdf | bad-redirect | 7535
+ xml | success | 6897
+ html | cdx-error | 6823
+ pdf | spn2-error:bad-request | 4664
+ pdf | spn2-error:unauthorized | 4391
+ pdf | spn-remote-error | 4206
+ pdf | spn2-error:service-unavailable | 2614
+ pdf | spn2-error:job-failed | 2562
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1905
+ html | spn2-cdx-lookup-failure | 1018
+ pdf | redirects-exceeded | 1015
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36515867
+ pdf | | 22909334
+ pdf | 301 | 7969702
+ html | 200 | 1653303
+ pdf | 503 | 928507
+ pdf | 403 | 823755
+ pdf | 302 | 792842
+ pdf | 400 | 462108
+ html | | 426474
+ pdf | 404 | 422163
+ pdf | 401 | 270611
+ pdf | 500 | 248675
+ html | 301 | 211713
+ pdf | 303 | 109686
+ pdf | 410 | 50648
+ pdf | 502 | 37663
+ pdf | 429 | 31982
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14835
+ html | 404 | 9573
+ pdf | 999 | 9296
+ pdf | 307 | 3972
+ pdf | 308 | 3914
+ html | 500 | 3625
+ pdf | 202 | 3515
+ xml | 200 | 2537
+ pdf | 520 | 2072
+ pdf | 206 | 1665
+ pdf | 521 | 1075
+ html | 302 | 1072
+ pdf | 504 | 1000
+ pdf | 412 | 476
+ pdf | 300 | 434
+ pdf | 505 | 429
+ pdf | 406 | 393
+ html | 403 | 382
+ html | 503 | 378
+ pdf | 421 | 298
+ html | 303 | 268
+ pdf | 508 | 195
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 408 | 50
+ pdf | 204 | 34
+ pdf | 416 | 29
+ pdf | 501 | 29
+ pdf | 530 | 27
+ pdf | 507 | 21
+ (50 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but that have no row with the same `sha1hex` in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 8514315 | 6401104
+ (1 row)
diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt
new file mode 100644
index 0000000..a8a9cd5
--- /dev/null
+++ b/sql/stats/2021-04-08_table_sizes.txt
@@ -0,0 +1,40 @@
+
+## SQL Table Sizes
+
+ Size: 467.23G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 26 GB | 76 GB
+ "public"."grobid" | 69 GB | 6834 MB | 75 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."ingest_request" | 39 GB | 32 GB | 70 GB
+ "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB
+ "public"."file_meta" | 32 GB | 21 GB | 53 GB
+ "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ (12 rows)
+
diff --git a/sql/stats/README.md b/sql/stats/README.md
new file mode 100644
index 0000000..62e213c
--- /dev/null
+++ b/sql/stats/README.md
@@ -0,0 +1,120 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+Counts by status code:
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+GROBID versions used (successful runs only, `status_code = 200`):
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+Uncrawled requests by source:
+
+ -- TODO: verify this query (anti-join: requests with no matching ingest_file_result row)
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
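+Results by source (`attempts` counts every request, including uncrawled ones
+with no matching `ingest_file_result` row, because the join is a `LEFT JOIN`):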
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but that have no row with the same `sha1hex` in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
diff --git a/sql/table_sizes.md b/sql/table_sizes.md
new file mode 100644
index 0000000..3596b2b
--- /dev/null
+++ b/sql/table_sizes.md
@@ -0,0 +1,11 @@
+
+## September 2019
+
+ table_name | table_size | indexes_size | total_size
+ --------------------------------------------------------------+------------+--------------+------------
+ "public"."cdx" | 31 GB | 27 GB | 58 GB
+ "public"."file_meta" | 13 GB | 6500 MB | 19 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."grobid" | 4994 MB | 6678 MB | 11 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB