aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-06 18:32:35 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-06 18:32:35 -0800
commit175019c96fced3e21d0f60ea1a4a37da6b8872ac (patch)
treef42fbbe9c8ac06ae9eb06373ab9eec96d2b3a177
parentb0b66c20c6ffb9d8acc626068964d7dfd5d3bcdc (diff)
parent47ca1a273912c8836630b0930b71a4e66fd2c85b (diff)
downloadsandcrawler-175019c96fced3e21d0f60ea1a4a37da6b8872ac.tar.gz
sandcrawler-175019c96fced3e21d0f60ea1a4a37da6b8872ac.zip
Merge branch 'bnewbold-html-ingest'
-rw-r--r--kafka/topics.md16
-rwxr-xr-xplease2
-rw-r--r--proposals/20201026_html_ingest.md127
-rw-r--r--proposals/20201103_xml_ingest.md81
-rw-r--r--python/.pylintrc2
-rw-r--r--python/Makefile5
-rw-r--r--python/Pipfile12
-rw-r--r--python/Pipfile.lock1019
-rw-r--r--python/example.env4
-rwxr-xr-xpython/grobid2json.py2
-rwxr-xr-xpython/grobid_tool.py2
-rwxr-xr-xpython/ingest_file.py2
-rwxr-xr-xpython/pdfextract_tool.py2
-rwxr-xr-xpython/pdftrio_tool.py2
-rwxr-xr-xpython/persist_tool.py18
-rw-r--r--python/pytest.ini3
-rw-r--r--python/sandcrawler/db.py61
-rw-r--r--python/sandcrawler/html_ingest.py337
-rw-r--r--python/sandcrawler/html_metadata.py452
-rw-r--r--python/sandcrawler/ia.py64
-rw-r--r--python/sandcrawler/ingest.py367
-rw-r--r--python/sandcrawler/minio.py4
-rw-r--r--python/sandcrawler/misc.py47
-rw-r--r--python/sandcrawler/persist.py105
-rw-r--r--python/sandcrawler/xml.py7
-rwxr-xr-xpython/sandcrawler_worker.py79
-rw-r--r--python/tests/files/dlib_05vanhyning.html350
-rw-r--r--python/tests/files/first_monday_ojs3_fulltext.html441
-rw-r--r--python/tests/files/first_monday_ojs3_landingpage.html616
-rw-r--r--python/tests/files/genders_g58_fairlie.html146
-rw-r--r--python/tests/files/nature_article.html1379
-rw-r--r--python/tests/files/peerj_oa_article.html2365
-rw-r--r--python/tests/files/scielo_article.jats.xml336
-rw-r--r--python/tests/test_html_ingest.py14
-rw-r--r--python/tests/test_html_metadata.py227
-rw-r--r--python/tests/test_pdfextract.py2
-rw-r--r--python/tests/test_xml.py18
-rw-r--r--sql/dump_unmatched_glutton_pdf.sql19
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql15
-rw-r--r--sql/monitoring_queries.md26
40 files changed, 8227 insertions, 549 deletions
diff --git a/kafka/topics.md b/kafka/topics.md
index 7a34c83..06faf8e 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -59,6 +59,18 @@ retention (on both a size and time basis).
=> 12 partitions
=> key is sha1hex of PDF; enable key compaction; gzip compression
+ sandcrawler-ENV.xml-doc
+ => fulltext XML; mostly JATS XML
+ => schema is JSON, with 'jats_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of XML document; enable key compaction; gzip compression
+
+ sandcrawler-ENV.html-teixml
+ => extracted fulltext from HTML; mostly TEI-XML
+ => schema is JSON, with 'tei_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of source HTML document; enable key compaction; gzip compression
+
sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE
=> thumbnail images (eg, png, jpg) from PDFs
=> raw bytes in message (no JSON or other wrapping). fields average 10 KByte
@@ -194,3 +206,7 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
+
diff --git a/please b/please
index 10fa843..4800112 100755
--- a/please
+++ b/please
@@ -600,7 +600,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do! (try --help)")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
if not (args.prod or args.qa) or (args.prod and args.qa):
print("must pass one of --prod or --qa")
diff --git a/proposals/20201026_html_ingest.md b/proposals/20201026_html_ingest.md
new file mode 100644
index 0000000..c06f180
--- /dev/null
+++ b/proposals/20201026_html_ingest.md
@@ -0,0 +1,127 @@
+
+status: wip
+
+HTML Ingest Pipeline
+========================
+
+Basic goal: given an ingest request of type 'html', output an object (JSON)
+which could be imported into fatcat.
+
+Should work with things like (scholarly) blog posts, micropubs, registrations,
+protocols. Doesn't need to work with everything to start. "Platform" sites
+(like youtube, figshare, etc) will probably be a different ingest worker.
+
+A current unknown is what the expected size of this metadata is. Both in number
+of documents and amount of metadata per document.
+
+Example HTML articles to start testing:
+
+- complex distill article: <https://distill.pub/2020/bayesian-optimization/>
+- old HTML journal: <http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm>
+- NIH pub: <https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html>
+- First Monday (OJS): <https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729>
+- d-lib: <http://www.dlib.org/dlib/july17/williams/07williams.html>
+
+
+## Ingest Process
+
+Follow base URL to terminal document, which is assumed to be a status=200 HTML document.
+
+Verify that terminal document is fulltext. Extract both metadata and fulltext.
+
+Extract list of sub-resources. Filter out unwanted (eg favicon, analytics,
+unnecessary), apply a sanity limit. Convert to fully qualified URLs. For each
+sub-resource, fetch down to the terminal resource, and compute hashes/metadata.
+
+Open questions:
+
+- will probably want to parallelize sub-resource fetching. async?
+- behavior when fetching a sub-resource fails
+
+
+## Ingest Result Schema
+
+JSON should be basically compatible with existing `ingest_file_result` objects,
+with some new sub-objects.
+
+Overall object (`IngestWebResult`):
+
+- `status`: str
+- `hit`: bool
+- `error_message`: optional, if an error
+- `hops`: optional, array of URLs
+- `cdx`: optional; single CDX row of primary HTML document
+- `terminal`: optional; same as ingest result
+ - `terminal_url`
+ - `terminal_dt`
+ - `terminal_status_code`
+ - `terminal_sha1hex`
+- `request`: optional but usually present; ingest request object, verbatim
+- `file_meta`: optional; file metadata about primary HTML document
+- `html_biblio`: optional; extracted biblio metadata from primary HTML document
+- `scope`: optional; detected/guessed scope (fulltext, etc)
+- `html_resources`: optional; array of sub-resources. primary HTML is not included
+- `html_body`: optional; just the status code and some metadata are passed through;
+ actual document would go through a different Kafka topic
+ - `status`: str
+ - `agent`: str, eg "trafilatura/0.4"
+ - `tei_xml`: optional, str
+ - `word_count`: optional, int
+
+
+## New SQL Tables
+
+`html_meta`
+ sha1hex (primary key)
+ updated (of SQL row)
+ status
+ scope
+ has_teixml
+ has_thumbnail
+ word_count (from teixml fulltext)
+ biblio (JSON)
+ resources (JSON)
+
+Also writes to `ingest_file_result`, `file_meta`, and `cdx`, all only for the base HTML document.
+
+
+## Fatcat API Wants
+
+Would be nice to have lookup by SURT+timestamp, and/or by sha1hex of terminal base file.
+
+`hide` option for cdx rows; also for fileset equivalent.
+
+
+## New Workers
+
+Could reuse existing worker, have code branch depending on type of ingest.
+
+ingest file worker
+ => same as existing worker, because could be calling SPN
+
+persist result
+ => same as existing worker; adds persisting various HTML metadata
+
+persist html text
+ => talks to seaweedfs
+
+
+## New Kafka Topics
+
+HTML ingest result topic (webcapture-ish)
+
+sandcrawler-ENV.html-teixml
+ JSON wrapping TEI-XML (same as other fulltext topics)
+ key compaction and content compression enabled
+
+JSON schema:
+
+- `key` and `sha1hex`: str; used as kafka key
+- `status`: str
+- `tei_xml`: str, optional
+- `word_count`: int, optional
+
+## New S3/SeaweedFS Content
+
+`sandcrawler` bucket, `html` folder, `.tei.xml` suffix.
+
diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md
new file mode 100644
index 0000000..181cc11
--- /dev/null
+++ b/proposals/20201103_xml_ingest.md
@@ -0,0 +1,81 @@
+
+status: wip
+
+TODO:
+x XML fulltext URL extractor (based on HTML biblio metadata, not PDF url extractor)
+x differentiate JATS XML and scielo XML from generic XML?
+ application/xml+jats is what fatcat is doing for abstracts
+ but it should be application/jats+xml?
+ application/tei+xml
+ if startswith "<article " and "<article-meta>" => JATS
+x refactor ingest worker to be more general
+x have ingest code publish body to kafka topic
+x write a persist worker
+/ create/configure kafka topic
+- test everything locally
+- fatcat: ingest tool to create requests
+- fatcat: entity updates worker creates XML ingest requests for specific sources
+- fatcat: ingest file import worker allows XML results
+- ansible: deployment of persist worker
+
+XML Fulltext Ingest
+====================
+
+This document details changes to include XML fulltext ingest in the same way
+that we currently ingest PDF fulltext.
+
+Currently this will just fetch the single XML document, which is often lacking
+figures, tables, and other required files.
+
+## Text Encoding
+
+Because we would like to treat XML as a string in a couple of contexts, but XML
+can have multiple encodings (indicated in an XML header), we are in a bit of a
+bind. Simply parsing into unicode and then re-encoding as UTF-8 could result in
+a header/content mismatch. Any form of re-encoding will change the hash of the
+document. For recording in fatcat, the file metadata will be passed through.
+For storing in Kafka and blob store (for downstream analysis), we will parse
+the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8
+encoding. The hash of the *original* XML file will be used as the key for
+referring to this document. This is unintuitive, but similar to what we are
+doing with PDF and HTML documents (extracting in a useful format, but keeping
+the original document's hash as a key).
+
+Unclear if we need to do this re-encode process for XML documents already in
+UTF-8 encoding.
+
+## Ingest Worker
+
+Could either re-use HTML metadata extractor to fetch XML fulltext links, or
+fork that code off to a separate method, like the PDF fulltext URL extractor.
+
+Hopefully can re-use almost all of the PDF pipeline code, by making that ingest
+worker class more generic and subclassing it.
+
+Result objects are treated the same as PDF ingest results: the result object
+has context about status, and if successful, file metadata and CDX row of the
+terminal object.
+
+TODO: should it be assumed that XML fulltext will end up in S3 bucket? or
+should there be an `xml_meta` SQL table tracking this, like we have for PDFs
+and HTML?
+
+TODO: should we detect and specify the XML schema better? Eg, indicate if JATS.
+
+
+## Persist Pipeline
+
+### Kafka Topic
+
+sandcrawler-ENV.xml-doc
+ similar to other fulltext topics; JSON wrapping the XML
+ key compaction, content compression
+
+### S3/SeaweedFS
+
+`sandcrawler` bucket, `xml` folder. Extension could depend on sub-type of XML?
+
+### Persist Worker
+
+New S3-only worker that pulls from kafka topic and pushes to S3. Works
+basically the same as PDF persist in S3-only mode, or like pdf-text worker.
diff --git a/python/.pylintrc b/python/.pylintrc
index 80e203d..387bca1 100644
--- a/python/.pylintrc
+++ b/python/.pylintrc
@@ -11,4 +11,4 @@ include-ids=yes
notes=FIXME,XXX,DELETEME
[TYPECHECK]
-ignored-modules=responses
+extension-pkg-whitelist=selectolax,pydantic,responses
diff --git a/python/Makefile b/python/Makefile
index f783d0e..0a97437 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -17,6 +17,7 @@ lint: ## Run lints (eg, flake8, mypy)
#pipenv run flake8 . --exit-zero
pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
+ pipenv run pylint --rcfile=.pylintrc -E --jobs=4 sandcrawler tests *.py
#pipenv run pytype sandcrawler/
.PHONY: fmt
@@ -24,7 +25,9 @@ fmt: ## Run code formating on all source code
pipenv run black *.py sandcrawler/ tests/
.PHONY: test
-test: lint ## Run all tests and lints
+test: ## Run all tests and lints
+ pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
+ pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
pipenv run pytest
.PHONY: coverage
diff --git a/python/Pipfile b/python/Pipfile
index 17734ad..3d39fa5 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -38,6 +38,14 @@ Flask = ">=1"
urlcanon = "*"
pillow = ">=3"
python-poppler = ">=0.2.1"
+selectolax = ">=0.2"
+trafilatura = "*"
+pydantic = ">=1.7"
+dateparser = "*"
+braveblock = "*"
+dynaconf = ">=3"
+sentry-sdk = { version = ">=0.14.0", extras = [] }
+zstandard = "*"
# must lock black to an exact version because it is still "beta"
# see: https://github.com/psf/black/issues/517
@@ -47,9 +55,9 @@ black = "==19.10b0"
python_version = "3.7"
[packages.globalwayback]
-version = ">=0.3"
+version = ">=0.6.5"
index = "ia"
[packages.wayback]
-version = ">=0.2.1.2"
+version = ">=0.6.3"
index = "ia"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index fcc1434..68322e6 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "0710cce29b75fe2092b0bf2cbbe758688e6ffb34dc26a01fc769007bd1c66f2c"
+ "sha256": "747d6865429ecfbd558a7583b385e213de4e53145a3dab9fdbc5c93f6872901f"
},
"pipfile-spec": 6,
"requires": {
@@ -21,22 +21,6 @@
]
},
"default": {
- "aerospike": {
- "hashes": [
- "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620",
- "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d",
- "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289",
- "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0",
- "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40",
- "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb",
- "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c",
- "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1",
- "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11",
- "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0",
- "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8"
- ],
- "version": "==3.10.0"
- },
"appdirs": {
"hashes": [
"sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
@@ -46,25 +30,18 @@
},
"attrs": {
"hashes": [
- "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
- "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
+ "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594",
+ "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc"
],
- "version": "==19.3.0"
- },
- "backports.csv": {
- "hashes": [
- "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
- "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
- ],
- "version": "==1.0.7"
+ "version": "==20.2.0"
},
"beautifulsoup4": {
"hashes": [
- "sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7",
- "sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8",
- "sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"
+ "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
+ "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
+ "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
],
- "version": "==4.9.1"
+ "version": "==4.9.3"
},
"black": {
"hashes": [
@@ -82,56 +59,71 @@
},
"boto3": {
"hashes": [
- "sha256:16f83ca3aa98d3faeb4f0738b878525770323e5fb9952435ddf58ca09aacec7c",
- "sha256:dc87ef82c81d2938f91c7ebfa85dfd032fff1bd3b67c9f66d74b21f8ec1e353d"
+ "sha256:07c61751a31a3f79fc6871648aaf9c62a7242c1e5f0777a71a4c36bdb4ff2642",
+ "sha256:fe5fbcf988e6f5b87a31ddd6ba92a6485d35f666d26b9c9750263aa8b0d33b60"
],
"index": "ia",
- "version": "==1.14.10"
+ "version": "==1.16.11"
},
"botocore": {
"hashes": [
- "sha256:b22db58da273b77529edef71425f9c281bc627b1b889f81960750507238abbb8",
- "sha256:cb0d7511a68439bf6f16683489130e06c5bbf9f5a9d647e0cbf63d79f3d3bdaa"
+ "sha256:85c49ec723e86317af46594f9a1843c6d64ecc3ab43c6e0a58a57d9c7cbbebe5",
+ "sha256:ada6405f3ff3fcd83058aa5060553f3885aa831dcd385f0a89301b770a1c5dcc"
+ ],
+ "version": "==1.19.11"
+ },
+ "braveblock": {
+ "hashes": [
+ "sha256:2cf61c88c283687c38e6c0301cd1c676c861adde4bf43ed5d1b95096f59e8903",
+ "sha256:4fab507a6e4a2d4c34dc28e3b25e178ea19511c75f69dfce25a141f3aacf8cd9",
+ "sha256:57b1dd76f2e787edfa899276268e07c1a06ae6cd4ff5475560a18250628e1f00",
+ "sha256:5a77c443b67c0c6b2e9c1cb62b13e1c795fbba07c2b574b7652e4cd89bf782e3",
+ "sha256:5c03485a4ec143ff8caf5a393263f40a74b401c7c194131d06c9269afc8ef013",
+ "sha256:66146166fed2905e34a466d63ce1c8b19fca53a74118092651d13d19e644327a",
+ "sha256:786d2db7551f3defe2fa089aa0b8c225b90b102b540e132a7fe409529bba08f2",
+ "sha256:9a2a7bf2c0b0b1b5a6a7cf1abea6315a97c380341e19d4348e153a21322fc249",
+ "sha256:c7f3e4d0244bb33d9643e87522e23086b602959ab5cb525b3db3adc94c38d5b1",
+ "sha256:ddd8691691b3066128f440680dfef6edd72eb9711f1032a029ab43b7cbf726cd",
+ "sha256:f9af63f32d51786497190ae2c171bf8ce678612a5a52d5b4a58e303376d7b81e",
+ "sha256:fcf9548015231195906a88a3727d716e155e82d2b9f926332e4a5ce38e8893e5"
],
- "version": "==1.17.10"
+ "index": "ia",
+ "version": "==0.1.8"
},
"brotli": {
"hashes": [
- "sha256:0538dc1744fd17c314d2adc409ea7d1b779783b89fd95bcfb0c2acc93a6ea5a7",
- "sha256:0970a47f471782912d7705160b2b0a9306e68e6fadf9cffcaeb42d8f0951e26c",
- "sha256:113f51658e6fe548dce4b3749f6ef6c24de4184ba9c10a909cbee4261c2a5da0",
- "sha256:1e1aa9c4d1558889f42749c8baf846007953bfd32c8209230cf1cd1f5ef33495",
- "sha256:2f2f4f78f29ac4a45d15b3d9fc3fd9705e0ad313a44b129f6e1d0c6916bad0e2",
- "sha256:3269f6de1dd150fd0cce1c158b61ff5ac06d627fd3ae9c6ea03aed26fbbff7ea",
- "sha256:3f4a1f6240916c7984c7f2542786710f622992508dafee0b1714e6d340fb9ffd",
- "sha256:50dd9ad2a2bb12da4e9002a438672d182f98e546e99952de80280a1e1729664f",
- "sha256:5519a4b01b1a4f965083cbfa2ef2b9774c5a5f352341c47b50776ad109423d72",
- "sha256:5eb27722d320370315971c427eb8aa7cc0791f2a458840d357ac653bd0ad3a14",
- "sha256:5f06b4d5b6f58e5b5c220c2f23cad034dc5efa51b01fde2351ced1605bd980e2",
- "sha256:71ceee286ea7ec613f1c36f1c6181864a6ca24ebb55e371276f33d6af8742834",
- "sha256:72848d25a5f9e736db4af4512e0c3feecc094d57d241f8f1ae959115a2c39756",
- "sha256:743001bca75f4a6b4454be3510feca46f9d61a0c782a9bc2bc684bdb245e279e",
- "sha256:7ac98c71a15648fd11bc1f32608b6110e396121280790082e32b9a3109048bc6",
- "sha256:9d1c2dd27a1083fefd05b1b2f8df4a6bc2aaa6c21dd82cd41c8ae5e7c23a87f8",
- "sha256:a13ce9b419fe9f277c63f700efb0e444331509d1881b5610d2ba7e9080606967",
- "sha256:a19ef0952b9d2803df88dff07f45a6c92d5676afb9b8d69cf32232d684036d11",
- "sha256:ad766ca8b8c1419b71a22756b45264f45725c86133dc80a7cbe30b6b78c75620",
- "sha256:ad7963f261988ee0883816b6b9f206f11461c9b3cb5cfbca0c9ab5adc406d395",
- "sha256:af0451e23016631a2f52925a10d738ac4a0f794ac315c30380b22efc0c90cbc6",
- "sha256:c16201060c5a3f8742e3deae759014251ac92f382f82bc2a41dc079ff18c3f24",
- "sha256:c43b202f65891861a9a336984a103de25de235f756de69e32db893156f767013",
- "sha256:c675c6cce4295cb1a692f3de7416aacace7314e064b94bc86e93aceefce7fd3e",
- "sha256:d17cec0b992b1434f5f9df9986563605a4d1b1acd5574c87fc2ac014bcbd3316",
- "sha256:dc91f6129953861a73d9a65c52a8dd682b561a9ebaf65283541645cab6489917",
- "sha256:e2f4cbd1760d2bf2f30e396c2301999aab0191aec031a6a8a04950b2f575a536",
- "sha256:f192e6d3556714105c10486bbd6d045e38a0c04d9da3cef21e0a8dfd8e162df4",
- "sha256:f775b07026af2b1b0b5a8b05e41571cdcf3a315a67df265d60af301656a5425b",
- "sha256:f969ec7f56ba9636679e69ca07fba548312ccaca37412ee823c7f413541ad7e0",
- "sha256:f9dc52cd70907aafb99a773b66b156f2f995c7a0d284397c487c8b71ddbef2f9",
- "sha256:f9ee88bb52352588ceb811d045b5c9bb1dc38927bc150fd156244f60ff3f59f1",
- "sha256:fc7212e36ebeb81aebf7949c92897b622490d7c0e333a479c0395591e7994600"
- ],
- "version": "==1.0.7"
+ "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
+ "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
+ "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
+ "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
+ "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
+ "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
+ "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
+ "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
+ "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
+ "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
+ "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
+ "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
+ "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
+ "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
+ ],
+ "version": "==1.0.9"
},
"bs4": {
"hashes": [
@@ -163,42 +155,42 @@
},
"configparser": {
"hashes": [
- "sha256:2ca44140ee259b5e3d8aaf47c79c36a7ab0d5e94d70bd4105c03ede7a20ea5a1",
- "sha256:cffc044844040c7ce04e9acd1838b5f2e5fa3170182f6fda4d2ea8b0099dbadd"
+ "sha256:005c3b102c96f4be9b8f40dafbd4997db003d07d1caa19f37808be8031475f2a",
+ "sha256:08e8a59ef1817ac4ed810bb8e17d049566dd6e024e7566f6285c756db2bb4ff8"
],
- "version": "==5.0.0"
+ "version": "==5.0.1"
},
"confluent-kafka": {
"hashes": [
- "sha256:1b10a9e4ede8c7ee382c16075b55275963d3fe9b8eec3fc511d0868847cc6eed",
- "sha256:1c46cbc2eb0876f0cdbd33ed7ea684ed1b009a25b65cf87736d3506d2f4ae57e",
- "sha256:2500a78334d642e49b98710722e548c0e3d5dc4c6eae63f02d66448678ed2922",
- "sha256:2515771b18d190df2182881abcf02fe8fde0aab567402ff36295b35cd495de65",
- "sha256:3150c8875511e2cea4086206f3c10448f744c9c35f9033fd0874c8c55f7b87e2",
- "sha256:4b0a3c47f9183570e9ee77ae8c36080fbc1996045251e25772944e4dadf1db21",
- "sha256:4f875798bbc766767b9c6ed95b084fde851e0bf074527ab0daffa87f4e750635",
- "sha256:515049659b045b99e0464d5ff5def4785478490563bc5ac1341a4f29dc335e82",
- "sha256:52088adf1abdf3a384a54ec7a3bfaa0b61e5da8cc03a2e26a8351bbbf49f72a9",
- "sha256:5342d3ff348b8082eaa4c63f4c82a72f3bf0ef8efa12a8580c890fa6e160f761",
- "sha256:55734905c5a8642e596cf1e60ec4d86f05d31a185cbc71d1c73430bb0c08db19",
- "sha256:624349587e97135996383c58edd8d53b38c57d653e6536c1f816049fc75faea3",
- "sha256:804a7d71b3cb61444930af67986064c9555b8c33f05a27003ea314d6c847e522",
- "sha256:931231853cec933addfafa27772177dcfab899d82e2e39fe7485c0602088daf7",
- "sha256:a4f5edc1d7958bbf5f12ba83c1f83e22a66daa9c4318c7f28c5bb1db9289fe09",
- "sha256:a591936a90095144451f041315239b2c823b7a15fa820cf45e45c422591345d6",
- "sha256:a6eb8f3f553e98a6ef0d00f9cf8e4e8dde73c914a43a00fecef97330de80bcea",
- "sha256:aa48215edcf16071d44ba29951c82c5f541d5ec915590aff0b4240e8e13f3ba3",
- "sha256:bfacb9fa0e3a5e31a5ac9a5da15de656e95e7153e022ec5620095b76a6098ec0",
- "sha256:bfbcbe7068690369ac2de3fe953854de34ad5e901157e96bcb990ca8b86d1d93",
- "sha256:c2660807e5c1ecd723e280f76918794c3fd84595000c1e8de1f254f5d89a785c",
- "sha256:c42ff838ee5e248f95f65b5adca4e2fdd4a2817fa26cede36d83a426e0f1370c",
- "sha256:c5b741764d8ea2b8334fdaf4b56297c5bab780142f1c0cad0bd642cac30cb89e",
- "sha256:dac33a04f73093de275953867a05de244560aa9842def6316cbb52bc0f02eff3",
- "sha256:f1695a00789795f9f798588bb62688b563baf471a76ca20fa01c957844938d7d",
- "sha256:f25836e03559a381ba74b9a6940b716e61ba8ae2db2d5d3a40accbc60617e1af"
+ "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0",
+ "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3",
+ "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2",
+ "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696",
+ "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3",
+ "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8",
+ "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080",
+ "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221",
+ "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66",
+ "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003",
+ "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe",
+ "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4",
+ "sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9",
+ "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120",
+ "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410",
+ "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c",
+ "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35",
+ "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2",
+ "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf",
+ "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38",
+ "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44",
+ "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53",
+ "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592",
+ "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4",
+ "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e",
+ "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b"
],
"index": "ia",
- "version": "==1.4.2"
+ "version": "==1.5.0"
},
"contextlib2": {
"hashes": [
@@ -207,12 +199,34 @@
],
"version": "==0.6.0.post1"
},
+ "courlan": {
+ "hashes": [
+ "sha256:16b22e6b98838469793ce6c4b9501d7a7eff679c227a4d3c135349d1da12f623",
+ "sha256:649756066671c1fdcbef129766300aa1b1c5b2cf5bcdedcb0aadcd7f09cd5e6b"
+ ],
+ "version": "==0.2.3"
+ },
"crawllib": {
"hashes": [
"sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
],
"version": "==0.1.4.8"
},
+ "cssselect": {
+ "hashes": [
+ "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
+ "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
+ ],
+ "version": "==1.1.0"
+ },
+ "dateparser": {
+ "hashes": [
+ "sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a",
+ "sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8"
+ ],
+ "index": "ia",
+ "version": "==1.0.0"
+ },
"dawg": {
"hashes": [
"sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753",
@@ -241,26 +255,33 @@
],
"version": "==0.6.2"
},
- "docutils": {
- "hashes": [
- "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0",
- "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
- "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
- ],
- "version": "==0.15.2"
- },
"dogpile.cache": {
"hashes": [
"sha256:bc9dde1ffa5de0179efbcdc73773ef0553921130ad01955422f2932be35c059e"
],
"version": "==0.9.2"
},
+ "dynaconf": {
+ "hashes": [
+ "sha256:808adfe964f10695846dbf8dad7632e47fc3bc38860fd1887ed57dddffc4eff2",
+ "sha256:9b34ab2f811a81755f5eb4beac77a69e1e0887528c7e37fc4bc83fed52dcf502"
+ ],
+ "index": "ia",
+ "version": "==3.1.2"
+ },
"elasticsearch": {
"hashes": [
- "sha256:540d633afcc0a32972e4b489c4559c9a96e294850853238f7a18b1cbd267c2ed",
- "sha256:a8062a00b61bc7babeea028530667583a68ecb1a9f59ab0b22ff7feaf70d3564"
+ "sha256:5e08776fbb30c6e92408c7fa8c37d939210d291475ae2f364f0497975918b6fe",
+ "sha256:8c7e2374f53ee1b891ff2804116e0c7fb517585d6d5788ba668686bbc9d82e2d"
+ ],
+ "version": "==7.9.1"
+ },
+ "filelock": {
+ "hashes": [
+ "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59",
+ "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"
],
- "version": "==6.8.1"
+ "version": "==3.0.12"
},
"flask": {
"hashes": [
@@ -272,23 +293,30 @@
},
"ftfy": {
"hashes": [
- "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8"
+ "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720"
],
"index": "ia",
- "version": "==5.7"
+ "version": "==5.8"
},
"globalwayback": {
"hashes": [
- "sha256:46724c1445afa79f6e2d2ccf98e76eed072ff36df50409ed90ff26344a4b4ac4"
+ "sha256:429a88d91da6f4cd1eaa4f9beabc75c0d47271c7155de94a50f579f243c91323"
],
"index": "ia",
- "version": "==0.6.1"
+ "version": "==0.6.5"
+ },
+ "htmldate": {
+ "hashes": [
+ "sha256:03f4e9648bf5bade11ecdb2a825a06019fafbfdcafd88151a4ce0407325f43c7",
+ "sha256:2e383fdbac3fb8a3cc6307502d7b920bb10f938113a1d108ec315aa195a2bc28"
+ ],
+ "version": "==0.7.2"
},
"ialib": {
"hashes": [
- "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
+ "sha256:0b1745e512266fd6c91af68763f2f8427eec6c92c5009fc75c50d9352fc764fc"
],
- "version": "==0.3.0.1"
+ "version": "==0.5.1.1"
},
"idna": {
"hashes": [
@@ -299,11 +327,11 @@
},
"internetarchive": {
"hashes": [
- "sha256:6071c5be1a4f933af9e2dfa015cc0d63e79c404cfa29ae26121e54181079c947",
- "sha256:bad1c4152fb6286ce7c77737a853bb4e45bcefb89ca5834d75607419f08cb6fe"
+ "sha256:759053685c75e6e969d690043b82643c4016500abcbbc44e4daf52ec097a9a15",
+ "sha256:a20a0ace949f0f8d2257e0f416a24eb8b3deaf68fa09549a9fdb50f9ce817384"
],
"index": "ia",
- "version": "==1.9.3"
+ "version": "==1.9.5"
},
"itsdangerous": {
"hashes": [
@@ -340,6 +368,56 @@
],
"version": "==2.0"
},
+ "justext": {
+ "hashes": [
+ "sha256:330035dfaaa960465276afa1836dfb6e63791011a8dfc6da2757142cc4d14d54",
+ "sha256:4b8b7f0749e8725f0089ebe0239c1a45286d61bf507b3f05d136c2700dea4aa6"
+ ],
+ "version": "==2.2.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab",
+ "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b",
+ "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5",
+ "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301",
+ "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b",
+ "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d",
+ "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b",
+ "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9",
+ "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b",
+ "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311",
+ "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891",
+ "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a",
+ "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1",
+ "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856",
+ "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810",
+ "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51",
+ "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360",
+ "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4",
+ "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f",
+ "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230",
+ "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a",
+ "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f",
+ "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174",
+ "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf",
+ "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd",
+ "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3",
+ "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a",
+ "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5",
+ "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367",
+ "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c",
+ "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1",
+ "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8",
+ "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f",
+ "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc",
+ "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d",
+ "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9",
+ "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==4.6.1"
+ },
"markupsafe": {
"hashes": [
"sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
@@ -380,12 +458,12 @@
},
"minio": {
"hashes": [
- "sha256:6ecb7637a35f806733e9d112eacfa599a58d7c3d4698fda2b5c86fff5d34b417",
- "sha256:71984a47fc8268afdfd1d0ed5e45e72f45f6495591878b0eaa7f77b2503e96ab",
- "sha256:ba5978a97e3366983c8b4ea11f2ae8e1add995ab4789e0098dd2403199999ac4"
+ "sha256:63918e4d20143268e4316a079bb7da405b0b6aeeb3ca357444b3385eca16b53a",
+ "sha256:97d275ff01ddae45101eced0d9d5258f2869308c949b17d86a77b77a2a50b7b3",
+ "sha256:b74bf0466fb1038f41410ac9225d472446b7de246104c840c74bbe8a2e39eec3"
],
"index": "ia",
- "version": "==5.0.10"
+ "version": "==6.0.0"
},
"pathspec": {
"hashes": [
@@ -451,22 +529,24 @@
},
"psycopg2": {
"hashes": [
- "sha256:132efc7ee46a763e68a815f4d26223d9c679953cd190f1f218187cb60decf535",
- "sha256:2327bf42c1744a434ed8ed0bbaa9168cac7ee5a22a9001f6fc85c33b8a4a14b7",
- "sha256:27c633f2d5db0fc27b51f1b08f410715b59fa3802987aec91aeb8f562724e95c",
- "sha256:2c0afb40cfb4d53487ee2ebe128649028c9a78d2476d14a67781e45dc287f080",
- "sha256:2df2bf1b87305bd95eb3ac666ee1f00a9c83d10927b8144e8e39644218f4cf81",
- "sha256:440a3ea2c955e89321a138eb7582aa1d22fe286c7d65e26a2c5411af0a88ae72",
- "sha256:6a471d4d2a6f14c97a882e8d3124869bc623f3df6177eefe02994ea41fd45b52",
- "sha256:6b306dae53ec7f4f67a10942cf8ac85de930ea90e9903e2df4001f69b7833f7e",
- "sha256:a0984ff49e176062fcdc8a5a2a670c9bb1704a2f69548bce8f8a7bad41c661bf",
- "sha256:ac5b23d0199c012ad91ed1bbb971b7666da651c6371529b1be8cbe2a7bf3c3a9",
- "sha256:acf56d564e443e3dea152efe972b1434058244298a94348fc518d6dd6a9fb0bb",
- "sha256:d3b29d717d39d3580efd760a9a46a7418408acebbb784717c90d708c9ed5f055",
- "sha256:f7d46240f7a1ae1dd95aab38bd74f7428d46531f69219954266d669da60c0818"
+ "sha256:00195b5f6832dbf2876b8bf77f12bdce648224c89c880719c745b90515233301",
+ "sha256:068115e13c70dc5982dfc00c5d70437fe37c014c808acce119b5448361c03725",
+ "sha256:26e7fd115a6db75267b325de0fba089b911a4a12ebd3d0b5e7acb7028bc46821",
+ "sha256:2c93d4d16933fea5bbacbe1aaf8fa8c1348740b2e50b3735d1b0bf8154cbf0f3",
+ "sha256:56007a226b8e95aa980ada7abdea6b40b75ce62a433bd27cec7a8178d57f4051",
+ "sha256:56fee7f818d032f802b8eed81ef0c1232b8b42390df189cab9cfa87573fe52c5",
+ "sha256:6a3d9efb6f36f1fe6aa8dbb5af55e067db802502c55a9defa47c5a1dad41df84",
+ "sha256:a49833abfdede8985ba3f3ec641f771cca215479f41523e99dace96d5b8cce2a",
+ "sha256:ad2fe8a37be669082e61fb001c185ffb58867fdbb3e7a6b0b0d2ffe232353a3e",
+ "sha256:b8cae8b2f022efa1f011cc753adb9cbadfa5a184431d09b273fb49b4167561ad",
+ "sha256:d160744652e81c80627a909a0e808f3c6653a40af435744de037e3172cf277f5",
+ "sha256:d5062ae50b222da28253059880a871dc87e099c25cb68acf613d9d227413d6f7",
+ "sha256:f22ea9b67aea4f4a1718300908a2fb62b3e4276cf00bd829a97ab5894af42ea3",
+ "sha256:f974c96fca34ae9e4f49839ba6b78addf0346777b46c4da27a7bf54f48d3057d",
+ "sha256:fb23f6c71107c37fd667cb4ea363ddeb936b348bbd6449278eb92c189699f543"
],
"index": "ia",
- "version": "==2.8.5"
+ "version": "==2.8.6"
},
"publicsuffix": {
"hashes": [
@@ -474,6 +554,34 @@
],
"version": "==1.1.1"
},
+ "pydantic": {
+ "hashes": [
+ "sha256:01f0291f4951580f320f7ae3f2ecaf0044cdebcc9b45c5f882a7e84453362420",
+ "sha256:0fe8b45d31ae53d74a6aa0bf801587bd49970070eac6a6326f9fa2a302703b8a",
+ "sha256:2182ba2a9290964b278bcc07a8d24207de709125d520efec9ad6fa6f92ee058d",
+ "sha256:2c1673633ad1eea78b1c5c420a47cd48717d2ef214c8230d96ca2591e9e00958",
+ "sha256:388c0c26c574ff49bad7d0fd6ed82fbccd86a0473fa3900397d3354c533d6ebb",
+ "sha256:4ba6b903e1b7bd3eb5df0e78d7364b7e831ed8b4cd781ebc3c4f1077fbcb72a4",
+ "sha256:6665f7ab7fbbf4d3c1040925ff4d42d7549a8c15fe041164adfe4fc2134d4cce",
+ "sha256:95d4410c4e429480c736bba0db6cce5aaa311304aea685ebcf9ee47571bfd7c8",
+ "sha256:a2fc7bf77ed4a7a961d7684afe177ff59971828141e608f142e4af858e07dddc",
+ "sha256:a3c274c49930dc047a75ecc865e435f3df89715c775db75ddb0186804d9b04d0",
+ "sha256:ab1d5e4d8de00575957e1c982b951bffaedd3204ddd24694e3baca3332e53a23",
+ "sha256:b11fc9530bf0698c8014b2bdb3bbc50243e82a7fa2577c8cfba660bcc819e768",
+ "sha256:b9572c0db13c8658b4a4cb705dcaae6983aeb9842248b36761b3fbc9010b740f",
+ "sha256:c68b5edf4da53c98bb1ccb556ae8f655575cb2e676aef066c12b08c724a3f1a1",
+ "sha256:c8200aecbd1fb914e1bd061d71a4d1d79ecb553165296af0c14989b89e90d09b",
+ "sha256:c9760d1556ec59ff745f88269a8f357e2b7afc75c556b3a87b8dda5bc62da8ba",
+ "sha256:ce2d452961352ba229fe1e0b925b41c0c37128f08dddb788d0fd73fd87ea0f66",
+ "sha256:dfaa6ed1d509b5aef4142084206584280bb6e9014f01df931ec6febdad5b200a",
+ "sha256:e5fece30e80087d9b7986104e2ac150647ec1658c4789c89893b03b100ca3164",
+ "sha256:f045cf7afb3352a03bc6cb993578a34560ac24c5d004fa33c76efec6ada1361a",
+ "sha256:f83f679e727742b0c465e7ef992d6da4a7e5268b8edd8fdaf5303276374bef52",
+ "sha256:fc21a37ff3f545de80b166e1735c4172b41b017948a3fb2d5e2f03c219eac50a"
+ ],
+ "index": "ia",
+ "version": "==1.7.2"
+ },
"pylru": {
"hashes": [
"sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
@@ -482,10 +590,10 @@
},
"pymysql": {
"hashes": [
- "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
- "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+ "sha256:263040d2779a3b84930f7ac9da5132be0fefcd6f453a885756656103f8ee1fdd",
+ "sha256:44f47128dda8676e021c8d2dbb49a82be9e4ab158b9f03e897152a3a287c69ea"
],
- "version": "==0.9.3"
+ "version": "==0.10.1"
},
"python-dateutil": {
"hashes": [
@@ -504,10 +612,10 @@
},
"python-poppler": {
"hashes": [
- "sha256:ea1f4ce962bf0278f78414c9516ba1ab626b6ade3c2356cab61d853a5d2441b7"
+ "sha256:6843398adc9c290035646c4cf3c7bfcea9c8e04390bb9cd8fdc9bd063fb77880"
],
"index": "ia",
- "version": "==0.2.1"
+ "version": "==0.2.2"
},
"python-snappy": {
"hashes": [
@@ -524,10 +632,10 @@
},
"pytz": {
"hashes": [
- "sha256:a494d53b6d39c3c6e44c3bec237336e14305e4f29bbf800b599253057fbb79ed",
- "sha256:c35965d010ce31b23eeb663ed3cc8c906275d6be1a34393a1d73a41febf4a048"
+ "sha256:3e6b7dd2d1e0a59084bcee14a17af60c5c562cdc16d828e8eba2e683d3a7e268",
+ "sha256:5c55e189b682d420be27c6995ba6edce0c0a77dd67bfbe2ae6607134d5851ffd"
],
- "version": "==2020.1"
+ "version": "==2020.4"
},
"pyyaml": {
"hashes": [
@@ -546,6 +654,9 @@
"version": "==5.3.1"
},
"raven": {
+ "extras": [
+ "flask"
+ ],
"hashes": [
"sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
"sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
@@ -553,6 +664,13 @@
"index": "ia",
"version": "==6.10.0"
},
+ "readability-lxml": {
+ "hashes": [
+ "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305",
+ "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"
+ ],
+ "version": "==0.8.1"
+ },
"redis": {
"hashes": [
"sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
@@ -562,29 +680,51 @@
},
"regex": {
"hashes": [
- "sha256:08997a37b221a3e27d68ffb601e45abfb0093d39ee770e4257bd2f5115e8cb0a",
- "sha256:112e34adf95e45158c597feea65d06a8124898bdeac975c9087fe71b572bd938",
- "sha256:1700419d8a18c26ff396b3b06ace315b5f2a6e780dad387e4c48717a12a22c29",
- "sha256:2f6f211633ee8d3f7706953e9d3edc7ce63a1d6aad0be5dcee1ece127eea13ae",
- "sha256:52e1b4bef02f4040b2fd547357a170fc1146e60ab310cdbdd098db86e929b387",
- "sha256:55b4c25cbb3b29f8d5e63aeed27b49fa0f8476b0d4e1b3171d85db891938cc3a",
- "sha256:5aaa5928b039ae440d775acea11d01e42ff26e1561c0ffcd3d805750973c6baf",
- "sha256:654cb773b2792e50151f0e22be0f2b6e1c3a04c5328ff1d9d59c0398d37ef610",
- "sha256:690f858d9a94d903cf5cada62ce069b5d93b313d7d05456dbcd99420856562d9",
- "sha256:6ad8663c17db4c5ef438141f99e291c4d4edfeaacc0ce28b5bba2b0bf273d9b5",
- "sha256:89cda1a5d3e33ec9e231ece7307afc101b5217523d55ef4dc7fb2abd6de71ba3",
- "sha256:92d8a043a4241a710c1cf7593f5577fbb832cf6c3a00ff3fc1ff2052aff5dd89",
- "sha256:95fa7726d073c87141f7bbfb04c284901f8328e2d430eeb71b8ffdd5742a5ded",
- "sha256:97712e0d0af05febd8ab63d2ef0ab2d0cd9deddf4476f7aa153f76feef4b2754",
- "sha256:b2ba0f78b3ef375114856cbdaa30559914d081c416b431f2437f83ce4f8b7f2f",
- "sha256:bae83f2a56ab30d5353b47f9b2a33e4aac4de9401fb582b55c42b132a8ac3868",
- "sha256:c78e66a922de1c95a208e4ec02e2e5cf0bb83a36ceececc10a72841e53fbf2bd",
- "sha256:cf59bbf282b627130f5ba68b7fa3abdb96372b24b66bdf72a4920e8153fc7910",
- "sha256:e3cdc9423808f7e1bb9c2e0bdb1c9dc37b0607b30d646ff6faf0d4e41ee8fee3",
- "sha256:e9b64e609d37438f7d6e68c2546d2cb8062f3adb27e6336bc129b51be20773ac",
- "sha256:fbff901c54c22425a5b809b914a3bfaf4b9570eee0e5ce8186ac71eb2025191c"
- ],
- "version": "==2020.6.8"
+ "sha256:03855ee22980c3e4863dc84c42d6d2901133362db5daf4c36b710dd895d78f0a",
+ "sha256:06b52815d4ad38d6524666e0d50fe9173533c9cc145a5779b89733284e6f688f",
+ "sha256:11116d424734fe356d8777f89d625f0df783251ada95d6261b4c36ad27a394bb",
+ "sha256:119e0355dbdd4cf593b17f2fc5dbd4aec2b8899d0057e4957ba92f941f704bf5",
+ "sha256:127a9e0c0d91af572fbb9e56d00a504dbd4c65e574ddda3d45b55722462210de",
+ "sha256:1ec66700a10e3c75f1f92cbde36cca0d3aaee4c73dfa26699495a3a30b09093c",
+ "sha256:227a8d2e5282c2b8346e7f68aa759e0331a0b4a890b55a5cfbb28bd0261b84c0",
+ "sha256:2564def9ce0710d510b1fc7e5178ce2d20f75571f788b5197b3c8134c366f50c",
+ "sha256:297116e79074ec2a2f885d22db00ce6e88b15f75162c5e8b38f66ea734e73c64",
+ "sha256:2dc522e25e57e88b4980d2bdd334825dbf6fa55f28a922fc3bfa60cc09e5ef53",
+ "sha256:3a5f08039eee9ea195a89e180c5762bfb55258bfb9abb61a20d3abee3b37fd12",
+ "sha256:3dfca201fa6b326239e1bccb00b915e058707028809b8ecc0cf6819ad233a740",
+ "sha256:49461446b783945597c4076aea3f49aee4b4ce922bd241e4fcf62a3e7c61794c",
+ "sha256:4afa350f162551cf402bfa3cd8302165c8e03e689c897d185f16a167328cc6dd",
+ "sha256:4b5a9bcb56cc146c3932c648603b24514447eafa6ce9295234767bf92f69b504",
+ "sha256:52e83a5f28acd621ba8e71c2b816f6541af7144b69cc5859d17da76c436a5427",
+ "sha256:625116aca6c4b57c56ea3d70369cacc4d62fead4930f8329d242e4fe7a58ce4b",
+ "sha256:654c1635f2313d0843028487db2191530bca45af61ca85d0b16555c399625b0e",
+ "sha256:8092a5a06ad9a7a247f2a76ace121183dc4e1a84c259cf9c2ce3bbb69fac3582",
+ "sha256:832339223b9ce56b7b15168e691ae654d345ac1635eeb367ade9ecfe0e66bee0",
+ "sha256:8ca9dca965bd86ea3631b975d63b0693566d3cc347e55786d5514988b6f5b84c",
+ "sha256:96f99219dddb33e235a37283306834700b63170d7bb2a1ee17e41c6d589c8eb9",
+ "sha256:9b6305295b6591e45f069d3553c54d50cc47629eb5c218aac99e0f7fafbf90a1",
+ "sha256:a62162be05edf64f819925ea88d09d18b09bebf20971b363ce0c24e8b4aa14c0",
+ "sha256:aacc8623ffe7999a97935eeabbd24b1ae701d08ea8f874a6ff050e93c3e658cf",
+ "sha256:b45bab9f224de276b7bc916f6306b86283f6aa8afe7ed4133423efb42015a898",
+ "sha256:b88fa3b8a3469f22b4f13d045d9bd3eda797aa4e406fde0a2644bc92bbdd4bdd",
+ "sha256:b8a686a6c98872007aa41fdbb2e86dc03b287d951ff4a7f1da77fb7f14113e4d",
+ "sha256:bd904c0dec29bbd0769887a816657491721d5f545c29e30fd9d7a1a275dc80ab",
+ "sha256:bf4f896c42c63d1f22039ad57de2644c72587756c0cfb3cc3b7530cfe228277f",
+ "sha256:c13d311a4c4a8d671f5860317eb5f09591fbe8259676b86a85769423b544451e",
+ "sha256:c2c6c56ee97485a127555c9595c069201b5161de9d05495fbe2132b5ac104786",
+ "sha256:c32c91a0f1ac779cbd73e62430de3d3502bbc45ffe5bb6c376015acfa848144b",
+ "sha256:c3466a84fce42c2016113101018a9981804097bacbab029c2d5b4fcb224b89de",
+ "sha256:c454ad88e56e80e44f824ef8366bb7e4c3def12999151fd5c0ea76a18fe9aa3e",
+ "sha256:c8a2b7ccff330ae4c460aff36626f911f918555660cc28163417cb84ffb25789",
+ "sha256:cb905f3d2e290a8b8f1579d3984f2cfa7c3a29cc7cba608540ceeed18513f520",
+ "sha256:cfcf28ed4ce9ced47b9b9670a4f0d3d3c0e4d4779ad4dadb1ad468b097f808aa",
+ "sha256:dd3e6547ecf842a29cf25123fbf8d2461c53c8d37aa20d87ecee130c89b7079b",
+ "sha256:de7fd57765398d141949946c84f3590a68cf5887dac3fc52388df0639b01eda4",
+ "sha256:ea37320877d56a7f0a1e6a625d892cf963aa7f570013499f5b8d5ab8402b5625",
+ "sha256:f1fce1e4929157b2afeb4bb7069204d4370bab9f4fc03ca1fbec8bd601f8c87d",
+ "sha256:f43109822df2d3faac7aad79613f5f02e4eab0fc8ad7932d2e70e2a83bd49c26"
+ ],
+ "version": "==2020.10.28"
},
"requests": {
"hashes": [
@@ -628,6 +768,65 @@
],
"version": "==0.7.2"
},
+ "selectolax": {
+ "hashes": [
+ "sha256:005b5bd8fbf01f16a43a10f0cd5da41c9868c88e17bc8fa3dc1ce756913599ec",
+ "sha256:08d26cc587b65bbaf8ee02c59ed701442b4cc885b8efd748d0cebfd71a6adf17",
+ "sha256:09145e3fa548264f1f3a3fdc4183d1d89995506b11d87ce193409c5e2f69c354",
+ "sha256:131e23982b3d0ab7fe8da6ead3d0d9958d8a8fe8526159fcf15150aba9d68782",
+ "sha256:1b399833418ec12b0cbd38b09e25659dec35295c7481f5c53736af0efe57abb4",
+ "sha256:2270e509e5f6f2af42b939e87196451fdeed661a6bf2183fef8125a1637b6432",
+ "sha256:265fedbda6ce2793c6aaae0f251e59fd657478b64168df223319c46d6e13dd98",
+ "sha256:30dbe35e1a21890eb3bc5bf78a92623c61315bf16eac7539932a7a559397c2e8",
+ "sha256:327b538e22e1af44a09a4a5844644cb149650067d867528f164407a7e60f70af",
+ "sha256:3a26be6c3bb52dd11beb21ddcdf5f459ca7811d38d115df0425716777bd42148",
+ "sha256:47f89319f77a49162a63a72a998b37222a0fe9ef4239e9ec93f49f3bd1812f99",
+ "sha256:4e99ab32d443f0fd3f4880cda397829065ec875d6f82e4877d19bbf4de5ea44d",
+ "sha256:536a2fa2f59ba708c5e3f32f71eee9a36fdc9355ee76ef94d2f74f196a81b3d1",
+ "sha256:557976a2136ade40eb44795d9e59c26db5dd2e756b4e11d7e290ad16e08c59d7",
+ "sha256:5c8e752136b98d01e255def86e35b280938dbc6872794e87e241245a82874549",
+ "sha256:676b676fbabb623e7f1f478c0291b5892c585cc7a2cd310963dbaf2aa602f113",
+ "sha256:6c4418e6b6ad42c3e0de356a398756a616b78e14c2151b0df9e09c6ab779b456",
+ "sha256:70d25eceb4c46cf5e56a328b7d82fa5a4657ab5425f8005a147214f97a2be28a",
+ "sha256:75881a3ce1363c6a502a0c5288318dcf751c81beac7d0428f9f90f8152b2cdfa",
+ "sha256:761c975d2baf78f34fc972b8659b02e927719f01bf77cdcfc71e2cd42fcf566e",
+ "sha256:91d514ee2c483a5952d427ebf56f606d3fe542c4b0e2a0ed09ad2362d63fe8ee",
+ "sha256:97fe610e863bb7c723d79a5a303e28fbb614b8208269378a66f51d657ccb2924",
+ "sha256:98ad70bc574c9d820e5cdb1561f4c382a291603159b916bb64f5590a0d6817f6",
+ "sha256:9ac44a41af4b7b8aa1a987d23384c573fbd66b3e3c2b4c5d13958df167bd241d",
+ "sha256:9c3655eba589d8b0e48ada55103b0999e2a138546c7d87c4a145943cb76cd484",
+ "sha256:9dcb4c8eb9abf21d61c2d157ce63f083bd8c8eac2e7dbe8f4f719092efccce7b",
+ "sha256:a85c08238a3ecf31aa683aa408f4ec0cfe27e2c3066b62f94d03a79f118d796b",
+ "sha256:b3df1aacac8abfdb8a48903af270cc8dadb267bbcba4a1619509550c5621d2e8",
+ "sha256:b516b4f2cfea6a7d8b1ce1f6018f39ed84088c83c842afbeed4840b9ff0240a1",
+ "sha256:b6db30c9e1d09091d5c2ba7840533131f3c82dd6ddbdfad6858f2148659b5ce1",
+ "sha256:b7faa28d9a40559bcfa7a3d23505dd6c53de32c5c18a83086a695f47b37f0d4c",
+ "sha256:b9b5ae1ec3fdf7488b0bff2c2023e68a3ecb2e964f91b54ff1feb8b29858123f",
+ "sha256:bce0c91cb2c67c1028e26855f1399b093eed178dcb15d42732c976d62b0a57f3",
+ "sha256:c1381988ea2c9874dd75947f23d001127c2c362c6e89f97e93e9a076d948fed5",
+ "sha256:c2d2ee9ed63953f72ce837a8b299b2fe209662bf538290b556b5824ba9b50882",
+ "sha256:c313b18e95453dfb8ebad64230794ad3ffe8a0e2e0e7a4bb518355f1b4b9610b",
+ "sha256:d0aed68bb6e400d9ccb9b39c6b93ede5418bafdbfa4d755e69612efe58a82e65",
+ "sha256:d8d14dcce878395a4f28d9735504506f04b61599b123bd25a767f42d7f3c7c10",
+ "sha256:dd2a3e81f2c63076a1257d87ab4bd5c43a1ae68f1d5d5a8aea20959c1f5aa7e7",
+ "sha256:e0edc945fbb7013bbe7211297e1dcdbc1fe84fb20e6271e7dde2b388873cb205",
+ "sha256:e27a6088d0eea0bb9fb626d3cb61bbf747379eff7adb498189115d1270902a97",
+ "sha256:e38b68695e26c4122368d7d8300f03c501ddf342b55fa77a33680c3d1a136e3a",
+ "sha256:f34ae8ff9dd0f99cf704d70319530476afb5425fd0d744630af9bf974f056dca",
+ "sha256:f4413d61d83bd6eaf914680a08c5506ef1bd0736dd08ca9b5a6fb39638d3fe01"
+ ],
+ "index": "ia",
+ "version": "==0.2.9"
+ },
+ "sentry-sdk": {
+ "extras": [],
+ "hashes": [
+ "sha256:17b725df2258354ccb39618ae4ead29651aa92c01a92acf72f98efe06ee2e45a",
+ "sha256:9040539485226708b5cad0401d76628fba4eed9154bf301c50579767afe344fd"
+ ],
+ "index": "ia",
+ "version": "==0.19.2"
+ },
"six": {
"hashes": [
"sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
@@ -640,40 +839,51 @@
"sha256:1634eea42ab371d3d346309b93df7870a88610f0725d47528be902a0d95ecc55",
"sha256:a59dc181727e95d25f781f0eb4fd1825ff45590ec8ff49eadfd7f1a537cc0232"
],
+ "markers": "python_version >= '3.0'",
"version": "==2.0.1"
},
"sqlalchemy": {
"hashes": [
- "sha256:128bc917ed20d78143a45024455ff0aed7d3b96772eba13d5dbaf9cc57e5c41b",
- "sha256:156a27548ba4e1fed944ff9fcdc150633e61d350d673ae7baaf6c25c04ac1f71",
- "sha256:27e2efc8f77661c9af2681755974205e7462f1ae126f498f4fe12a8b24761d15",
- "sha256:2a12f8be25b9ea3d1d5b165202181f2b7da4b3395289000284e5bb86154ce87c",
- "sha256:31c043d5211aa0e0773821fcc318eb5cbe2ec916dfbc4c6eea0c5188971988eb",
- "sha256:65eb3b03229f684af0cf0ad3bcc771970c1260a82a791a8d07bffb63d8c95bcc",
- "sha256:6cd157ce74a911325e164441ff2d9b4e244659a25b3146310518d83202f15f7a",
- "sha256:703c002277f0fbc3c04d0ae4989a174753a7554b2963c584ce2ec0cddcf2bc53",
- "sha256:869bbb637de58ab0a912b7f20e9192132f9fbc47fc6b5111cd1e0f6cdf5cf9b0",
- "sha256:8a0e0cd21da047ea10267c37caf12add400a92f0620c8bc09e4a6531a765d6d7",
- "sha256:8d01e949a5d22e5c4800d59b50617c56125fc187fbeb8fa423e99858546de616",
- "sha256:925b4fe5e7c03ed76912b75a9a41dfd682d59c0be43bce88d3b27f7f5ba028fb",
- "sha256:9cb1819008f0225a7c066cac8bb0cf90847b2c4a6eb9ebb7431dbd00c56c06c5",
- "sha256:a87d496884f40c94c85a647c385f4fd5887941d2609f71043e2b73f2436d9c65",
- "sha256:a9030cd30caf848a13a192c5e45367e3c6f363726569a56e75dc1151ee26d859",
- "sha256:a9e75e49a0f1583eee0ce93270232b8e7bb4b1edc89cc70b07600d525aef4f43",
- "sha256:b50f45d0e82b4562f59f0e0ca511f65e412f2a97d790eea5f60e34e5f1aabc9a",
- "sha256:b7878e59ec31f12d54b3797689402ee3b5cfcb5598f2ebf26491732758751908",
- "sha256:ce1ddaadee913543ff0154021d31b134551f63428065168e756d90bdc4c686f5",
- "sha256:ce2646e4c0807f3461be0653502bb48c6e91a5171d6e450367082c79e12868bf",
- "sha256:ce6c3d18b2a8ce364013d47b9cad71db815df31d55918403f8db7d890c9d07ae",
- "sha256:e4e2664232005bd306f878b0f167a31f944a07c4de0152c444f8c61bbe3cfb38",
- "sha256:e8aa395482728de8bdcca9cc0faf3765ab483e81e01923aaa736b42f0294f570",
- "sha256:eb4fcf7105bf071c71068c6eee47499ab8d4b8f5a11fc35147c934f0faa60f23",
- "sha256:ed375a79f06cad285166e5be74745df1ed6845c5624aafadec4b7a29c25866ef",
- "sha256:f35248f7e0d63b234a109dd72fbfb4b5cb6cb6840b221d0df0ecbf54ab087654",
- "sha256:f502ef245c492b391e0e23e94cba030ab91722dcc56963c85bfd7f3441ea2bbe",
- "sha256:fe01bac7226499aedf472c62fa3b85b2c619365f3f14dd222ffe4f3aa91e5f98"
- ],
- "version": "==1.3.17"
+ "sha256:009e8388d4d551a2107632921320886650b46332f61dc935e70c8bcf37d8e0d6",
+ "sha256:0157c269701d88f5faf1fa0e4560e4d814f210c01a5b55df3cab95e9346a8bcc",
+ "sha256:0a92745bb1ebbcb3985ed7bda379b94627f0edbc6c82e9e4bac4fb5647ae609a",
+ "sha256:0cca1844ba870e81c03633a99aa3dc62256fb96323431a5dec7d4e503c26372d",
+ "sha256:166917a729b9226decff29416f212c516227c2eb8a9c9f920d69ced24e30109f",
+ "sha256:1f5f369202912be72fdf9a8f25067a5ece31a2b38507bb869306f173336348da",
+ "sha256:2909dffe5c9a615b7e6c92d1ac2d31e3026dc436440a4f750f4749d114d88ceb",
+ "sha256:2b5dafed97f778e9901b79cc01b88d39c605e0545b4541f2551a2fd785adc15b",
+ "sha256:2e9bd5b23bba8ae8ce4219c9333974ff5e103c857d9ff0e4b73dc4cb244c7d86",
+ "sha256:3aa6d45e149a16aa1f0c46816397e12313d5e37f22205c26e06975e150ffcf2a",
+ "sha256:4bdbdb8ca577c6c366d15791747c1de6ab14529115a2eb52774240c412a7b403",
+ "sha256:53fd857c6c8ffc0aa6a5a3a2619f6a74247e42ec9e46b836a8ffa4abe7aab327",
+ "sha256:5cdfe54c1e37279dc70d92815464b77cd8ee30725adc9350f06074f91dbfeed2",
+ "sha256:5d92c18458a4aa27497a986038d5d797b5279268a2de303cd00910658e8d149c",
+ "sha256:632b32183c0cb0053194a4085c304bc2320e5299f77e3024556fa2aa395c2a8b",
+ "sha256:7c735c7a6db8ee9554a3935e741cf288f7dcbe8706320251eb38c412e6a4281d",
+ "sha256:7cd40cb4bc50d9e87b3540b23df6e6b24821ba7e1f305c1492b0806c33dbdbec",
+ "sha256:84f0ac4a09971536b38cc5d515d6add7926a7e13baa25135a1dbb6afa351a376",
+ "sha256:8dcbf377529a9af167cbfc5b8acec0fadd7c2357fc282a1494c222d3abfc9629",
+ "sha256:950f0e17ffba7a7ceb0dd056567bc5ade22a11a75920b0e8298865dc28c0eff6",
+ "sha256:9e379674728f43a0cd95c423ac0e95262500f9bfd81d33b999daa8ea1756d162",
+ "sha256:b15002b9788ffe84e42baffc334739d3b68008a973d65fad0a410ca5d0531980",
+ "sha256:b6f036ecc017ec2e2cc2a40615b41850dc7aaaea6a932628c0afc73ab98ba3fb",
+ "sha256:bad73f9888d30f9e1d57ac8829f8a12091bdee4949b91db279569774a866a18e",
+ "sha256:bbc58fca72ce45a64bb02b87f73df58e29848b693869e58bd890b2ddbb42d83b",
+ "sha256:bca4d367a725694dae3dfdc86cf1d1622b9f414e70bd19651f5ac4fb3aa96d61",
+ "sha256:be41d5de7a8e241864189b7530ca4aaf56a5204332caa70555c2d96379e18079",
+ "sha256:bf53d8dddfc3e53a5bda65f7f4aa40fae306843641e3e8e701c18a5609471edf",
+ "sha256:c092fe282de83d48e64d306b4bce03114859cdbfe19bf8a978a78a0d44ddadb1",
+ "sha256:c3ab23ee9674336654bf9cac30eb75ac6acb9150dc4b1391bec533a7a4126471",
+ "sha256:ce64a44c867d128ab8e675f587aae7f61bd2db836a3c4ba522d884cd7c298a77",
+ "sha256:d05cef4a164b44ffda58200efcb22355350979e000828479971ebca49b82ddb1",
+ "sha256:d2f25c7f410338d31666d7ddedfa67570900e248b940d186b48461bd4e5569a1",
+ "sha256:d3b709d64b5cf064972b3763b47139e4a0dc4ae28a36437757f7663f67b99710",
+ "sha256:e32e3455db14602b6117f0f422f46bc297a3853ae2c322ecd1e2c4c04daf6ed5",
+ "sha256:ed53209b5f0f383acb49a927179fa51a6e2259878e164273ebc6815f3a752465",
+ "sha256:f605f348f4e6a2ba00acb3399c71d213b92f27f2383fc4abebf7a37368c12142",
+ "sha256:fcdb3755a7c355bc29df1b5e6fb8226d5c8b90551d202d69d0076a8a5649d68b"
+ ],
+ "version": "==1.3.20"
},
"surt": {
"hashes": [
@@ -684,30 +894,32 @@
},
"tldextract": {
"hashes": [
- "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7",
- "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808"
+ "sha256:d2762b1aa2a36857df8420d63c2c31706e4924da8773439a543365d38459afd8",
+ "sha256:f188eab8c90ff935f3fa49d9228049cd7f37fb47105c3f15f8e6dd6f6e25924a"
],
- "version": "==2.2.2"
+ "version": "==3.0.2"
},
"toml": {
"hashes": [
- "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f",
- "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
- "version": "==0.10.1"
+ "version": "==0.10.2"
},
- "total-ordering": {
+ "tqdm": {
"hashes": [
- "sha256:a14a2a138a52befaa02b3fd53eb3366f66da69020be299af3cf0b54c9441aacc"
+ "sha256:9ad44aaf0fc3697c06f6e05c7cf025dd66bc7bcb7613c66d85f4464c47ac8fad",
+ "sha256:ef54779f1c09f346b2b5a8e5c61f96fbcb639929e640e59f8cf810794f406432"
],
- "version": "==0.1.0"
+ "version": "==4.51.0"
},
- "tqdm": {
+ "trafilatura": {
"hashes": [
- "sha256:07c06493f1403c1380b630ae3dcbe5ae62abcf369a93bbc052502279f189ab8c",
- "sha256:cd140979c2bebd2311dfb14781d8f19bd5a9debb92dcab9f6ef899c987fcf71f"
+ "sha256:7881c889dbafb489af8c994c1ce5ed0885feeb1bbad895fbaab857fdf4d3bd0a",
+ "sha256:8ade07cc2ea61e7e6c23de92c1ba151561257fb22b6d17ed76aa1e5626ffc70f"
],
- "version": "==4.46.1"
+ "index": "ia",
+ "version": "==0.5.2"
},
"twitter": {
"hashes": [
@@ -720,28 +932,44 @@
"hashes": [
"sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
"sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+ "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
"sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
"sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
"sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+ "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
"sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
"sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
"sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
"sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
"sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
"sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+ "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
"sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
"sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+ "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
"sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+ "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
"sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
"sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
"sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
"sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
"sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+ "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
+ "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
+ "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
"sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+ "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
"sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
],
"version": "==1.4.1"
},
+ "tzlocal": {
+ "hashes": [
+ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
+ "sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4"
+ ],
+ "version": "==2.1"
+ },
"urlcanon": {
"hashes": [
"sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62"
@@ -751,11 +979,11 @@
},
"urllib3": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
+ "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"
],
"markers": "python_version != '3.4'",
- "version": "==1.22"
+ "version": "==1.25.11"
},
"warctools": {
"hashes": [
@@ -764,29 +992,32 @@
"version": "==4.10.0"
},
"wayback": {
+ "extras": [
+ "brotli"
+ ],
"hashes": [
- "sha256:a761515f81d4bcfa543ddb7fbe2b584508212735c438a269e86d4196015b4d6f"
+ "sha256:2cab62123044fcedadfa95640bfc539dcc96548ec7521a7b9ed29cacda216486"
],
"index": "ia",
- "version": "==0.6.1"
+ "version": "==0.6.3"
},
"wayback-esp": {
"hashes": [
- "sha256:283c1d38712dbf019ade15f5ffe8cf740951201a6a7cb1b9c98c3e84adb8b2f0"
+ "sha256:0b137b14a52768f621454b9ec5ca733136095421531ea15f32eaf94c1d146f1b"
],
- "version": "==0.2.8"
+ "version": "==0.2.10"
},
"wayback-search-js": {
"hashes": [
- "sha256:ae83f2719b0737d173c0a91ef13e9cfcd4d2f64bca8c00719f1977bbe5f864e2"
+ "sha256:e53f5fb1e3ed304eb3d6de89b0d263cef1b3cac9af6cccd5bea7fe8d6a97784a"
],
- "version": "==2.12.3"
+ "version": "==2.12.7"
},
"wbex-client": {
"hashes": [
- "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
+ "sha256:619ead0408195f4eb87198a99e497c649961da45fcf97cb9bc937ef9e06a9e7f"
],
- "version": "==0.1.5"
+ "version": "==0.1.6"
},
"wcwidth": {
"hashes": [
@@ -801,6 +1032,60 @@
"sha256:6c80b1e5ad3665290ea39320b91e1be1e0d5f60652b964a3070216de83d2e47c"
],
"version": "==1.0.1"
+ },
+ "zstandard": {
+ "hashes": [
+ "sha256:0646bd506cd1c83b94a5057568cbc7868f656c79ac22d2e19e9d280f64451a0c",
+ "sha256:0c3ea262cee9c8a624ae22760466a8144c3c2b62da6f2b2671f47d9f74d8315f",
+ "sha256:22362a1b5bf8693692be1d1609a25159cd67d5ff93200a2978aea815a63739e8",
+ "sha256:25ec0734f8c2eee8fd140cae3cde0ffc531ab6730be1f48b2b868a409a1a233d",
+ "sha256:2e66459d260d2332c5044625dc9f50ef883fe4366c15915d4d0deedb3b1dcba6",
+ "sha256:2f491936999f43301c424aaa9e03461ea218d9bb8574c1672a09260d30a4096e",
+ "sha256:39339ed8e0351e3a1d9e0792c5a77ac7da2091279dd78f3458d456bdc3cbb25e",
+ "sha256:3b41598ffc3cb3497bd6019aeeb1a55e272d3106f15d7855339eab92ed7659e8",
+ "sha256:4286cd5d76c9a2bf7cb9f9065c8f68b12221ddbcfba754577692442dce563995",
+ "sha256:45a3b64812152bf188044a1170bcaaeaee2175ec5340ea6a6810bf94b088886e",
+ "sha256:45e96e1b3bcf8f1060fad174938bfc9825f5d864ddc717b3dda1d876ab59eaaf",
+ "sha256:50f7692f32ebd86b87133f25211850f5025e730f75b364dfaab30e817a7780a1",
+ "sha256:6525190e90d49e07c88f88ee7cf02e1af76f9bf32a693e8dd6b8a5fe01b65079",
+ "sha256:68840f8117d087ecb82c2dfb7f32de237261220a569ea93a8bc0afeffb03ab58",
+ "sha256:68d15b407ac1f18e03fb89c93ade275cca766cb7eff03b26b40fdf9dba100679",
+ "sha256:754bcb077e2f946868e77670fb59907ac291542a14c836f89716376cd099107c",
+ "sha256:83f81d7c2e45e65654ea881683e7e597e813a862ba8e0596945de46657fbc285",
+ "sha256:85f59177e6a3cab285471a0e7ce048d07f6d39080b9766f8eaaf274f979f0afc",
+ "sha256:86494400d3923917124bd5f50b8e096de1dd7cfd890b164253bcd2283ef19539",
+ "sha256:8cb4cd3bb2e7213dd09432f8182d9acc8997bcd34fa3be44dffbb3f82d8d6dfd",
+ "sha256:9052398da52e8702cf9929999c8986b0f68b18c793e309cd8dff5cb7863d7652",
+ "sha256:9052870eeebbf4787fc9fc20703d16b6c32b4fffa1446045d05c64a8cb34f614",
+ "sha256:9119a52758dce523e82318433d41bc8053051af6d7dadd2ff3ada24d1cbf28cf",
+ "sha256:9572d3047579220f950e7fd6af647cc95e361dc671d10ad63215e07f147eec31",
+ "sha256:9d7d49b2d46233280c0a0d27046ab9321ceae329c4cbe8cffddfebb53dff3da2",
+ "sha256:a012f237fa5b00708f00e362035c032d1af5536796f9b410e76e61722176f607",
+ "sha256:a1ea3108dde195f9fb18fe99ee1674f85a99056793d2ea72fb3965eb48a0bd8f",
+ "sha256:a79db6a7db4ff91e7c5238d020d85aee1f4849ea357236899f9ed1773c5b66b4",
+ "sha256:a927f60735fcb5c19586c846c5f28da5edf8549142e4dd62ddf4b9579800a23c",
+ "sha256:ae4cfd9e023702609c59f5535d95d7b19d54d42902514fe4ece8792b65b3a0af",
+ "sha256:b021d3321107cdeba427a514d4faa35429525192e902e5b6608f346ef5ba5c8a",
+ "sha256:b3ac3401ae1945f3dab138819f58830fd658410aa2a53583c0a9af3e8809117d",
+ "sha256:b637e58757a9153ad562b530b82140dad5e505ae14d806b264a0802f343bd5dd",
+ "sha256:b711ee17b8676f367282ee654b8de750e2dfa2262e2eb07b7178b1524a273d44",
+ "sha256:b7e51d0d48153ece2db2c4e6bb2a71e781879027201dc7b718b3f27130547410",
+ "sha256:b8a1986ba41f6cf61f1234779ed492d026f87ab327cc6bf9e82d2e7a3f0b5b9c",
+ "sha256:c9da20d5e16f246861158b15cc908797ee6ceb5a799c8a3b97fe6c665627f0e5",
+ "sha256:dd156961934f7869aecfdf68da6f3f0fa48ad01923d64e9662038dff83f314d4",
+ "sha256:e149711b256fa8facbbce09b503a744c10fc03325742a9399c69c8569f0e9fe8",
+ "sha256:ece7f7ec03997357d61c44c50e6543123c0b7c2bdedc972b165d6832bf8868ad",
+ "sha256:ef36cb399ebc0941f68a4d3a675b13ad75a6037270ec3915ee337227b8bfec90",
+ "sha256:f1bfdbb37ada30bf6a08671a530e46ab24426bfad61efd28e5dc2beeb4f5b78d",
+ "sha256:f1c25e52e963dbe23a3ebc79ab904705eddcc15e14093fcde5059251090f01a6",
+ "sha256:f532d4c65c6ed6202b2c8bfc166648ec2c2ec2dc1d0fb06de643e87ce0a222c8",
+ "sha256:f559281d181c30ba14f0446a9e1a1ea6c4980792d7249bacbc575fcbcebde4b3",
+ "sha256:f5eccca127169257d8356069d298701fc612b05f6b768aa9ffc6e652c5169bd6",
+ "sha256:fa660370fe5b5e4f3c3952732aea358540e56e91c9233d55a6b6e508e047b315",
+ "sha256:fff79a30845c2591718cb8798196d117402b2d5d7506b5f3bb691972731c30b3"
+ ],
+ "index": "ia",
+ "version": "==0.14.0"
}
},
"develop": {
@@ -813,10 +1098,10 @@
},
"attrs": {
"hashes": [
- "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
- "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
+ "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594",
+ "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc"
],
- "version": "==19.3.0"
+ "version": "==20.2.0"
},
"backcall": {
"hashes": [
@@ -841,39 +1126,42 @@
},
"coverage": {
"hashes": [
- "sha256:00f1d23f4336efc3b311ed0d807feb45098fc86dee1ca13b3d6768cdab187c8a",
- "sha256:01333e1bd22c59713ba8a79f088b3955946e293114479bbfc2e37d522be03355",
- "sha256:0cb4be7e784dcdc050fc58ef05b71aa8e89b7e6636b99967fadbdba694cf2b65",
- "sha256:0e61d9803d5851849c24f78227939c701ced6704f337cad0a91e0972c51c1ee7",
- "sha256:1601e480b9b99697a570cea7ef749e88123c04b92d84cedaa01e117436b4a0a9",
- "sha256:2742c7515b9eb368718cd091bad1a1b44135cc72468c731302b3d641895b83d1",
- "sha256:2d27a3f742c98e5c6b461ee6ef7287400a1956c11421eb574d843d9ec1f772f0",
- "sha256:402e1744733df483b93abbf209283898e9f0d67470707e3c7516d84f48524f55",
- "sha256:5c542d1e62eece33c306d66fe0a5c4f7f7b3c08fecc46ead86d7916684b36d6c",
- "sha256:5f2294dbf7875b991c381e3d5af2bcc3494d836affa52b809c91697449d0eda6",
- "sha256:6402bd2fdedabbdb63a316308142597534ea8e1895f4e7d8bf7476c5e8751fef",
- "sha256:66460ab1599d3cf894bb6baee8c684788819b71a5dc1e8fa2ecc152e5d752019",
- "sha256:782caea581a6e9ff75eccda79287daefd1d2631cc09d642b6ee2d6da21fc0a4e",
- "sha256:79a3cfd6346ce6c13145731d39db47b7a7b859c0272f02cdb89a3bdcbae233a0",
- "sha256:7a5bdad4edec57b5fb8dae7d3ee58622d626fd3a0be0dfceda162a7035885ecf",
- "sha256:8fa0cbc7ecad630e5b0f4f35b0f6ad419246b02bc750de7ac66db92667996d24",
- "sha256:a027ef0492ede1e03a8054e3c37b8def89a1e3c471482e9f046906ba4f2aafd2",
- "sha256:a3f3654d5734a3ece152636aad89f58afc9213c6520062db3978239db122f03c",
- "sha256:a82b92b04a23d3c8a581fc049228bafde988abacba397d57ce95fe95e0338ab4",
- "sha256:acf3763ed01af8410fc36afea23707d4ea58ba7e86a8ee915dfb9ceff9ef69d0",
- "sha256:adeb4c5b608574a3d647011af36f7586811a2c1197c861aedb548dd2453b41cd",
- "sha256:b83835506dfc185a319031cf853fa4bb1b3974b1f913f5bb1a0f3d98bdcded04",
- "sha256:bb28a7245de68bf29f6fb199545d072d1036a1917dca17a1e75bbb919e14ee8e",
- "sha256:bf9cb9a9fd8891e7efd2d44deb24b86d647394b9705b744ff6f8261e6f29a730",
- "sha256:c317eaf5ff46a34305b202e73404f55f7389ef834b8dbf4da09b9b9b37f76dd2",
- "sha256:dbe8c6ae7534b5b024296464f387d57c13caa942f6d8e6e0346f27e509f0f768",
- "sha256:de807ae933cfb7f0c7d9d981a053772452217df2bf38e7e6267c9cbf9545a796",
- "sha256:dead2ddede4c7ba6cb3a721870f5141c97dc7d85a079edb4bd8d88c3ad5b20c7",
- "sha256:dec5202bfe6f672d4511086e125db035a52b00f1648d6407cc8e526912c0353a",
- "sha256:e1ea316102ea1e1770724db01998d1603ed921c54a86a2efcb03428d5417e489",
- "sha256:f90bfc4ad18450c80b024036eaf91e4a246ae287701aaa88eaebebf150868052"
- ],
- "version": "==5.1"
+ "sha256:0203acd33d2298e19b57451ebb0bed0ab0c602e5cf5a818591b4918b1f97d516",
+ "sha256:0f313707cdecd5cd3e217fc68c78a960b616604b559e9ea60cc16795c4304259",
+ "sha256:1c6703094c81fa55b816f5ae542c6ffc625fec769f22b053adb42ad712d086c9",
+ "sha256:1d44bb3a652fed01f1f2c10d5477956116e9b391320c94d36c6bf13b088a1097",
+ "sha256:280baa8ec489c4f542f8940f9c4c2181f0306a8ee1a54eceba071a449fb870a0",
+ "sha256:29a6272fec10623fcbe158fdf9abc7a5fa032048ac1d8631f14b50fbfc10d17f",
+ "sha256:2b31f46bf7b31e6aa690d4c7a3d51bb262438c6dcb0d528adde446531d0d3bb7",
+ "sha256:2d43af2be93ffbad25dd959899b5b809618a496926146ce98ee0b23683f8c51c",
+ "sha256:381ead10b9b9af5f64646cd27107fb27b614ee7040bb1226f9c07ba96625cbb5",
+ "sha256:47a11bdbd8ada9b7ee628596f9d97fbd3851bd9999d398e9436bd67376dbece7",
+ "sha256:4d6a42744139a7fa5b46a264874a781e8694bb32f1d76d8137b68138686f1729",
+ "sha256:50691e744714856f03a86df3e2bff847c2acede4c191f9a1da38f088df342978",
+ "sha256:530cc8aaf11cc2ac7430f3614b04645662ef20c348dce4167c22d99bec3480e9",
+ "sha256:582ddfbe712025448206a5bc45855d16c2e491c2dd102ee9a2841418ac1c629f",
+ "sha256:63808c30b41f3bbf65e29f7280bf793c79f54fb807057de7e5238ffc7cc4d7b9",
+ "sha256:71b69bd716698fa62cd97137d6f2fdf49f534decb23a2c6fc80813e8b7be6822",
+ "sha256:7858847f2d84bf6e64c7f66498e851c54de8ea06a6f96a32a1d192d846734418",
+ "sha256:78e93cc3571fd928a39c0b26767c986188a4118edc67bc0695bc7a284da22e82",
+ "sha256:7f43286f13d91a34fadf61ae252a51a130223c52bfefb50310d5b2deb062cf0f",
+ "sha256:86e9f8cd4b0cdd57b4ae71a9c186717daa4c5a99f3238a8723f416256e0b064d",
+ "sha256:8f264ba2701b8c9f815b272ad568d555ef98dfe1576802ab3149c3629a9f2221",
+ "sha256:9342dd70a1e151684727c9c91ea003b2fb33523bf19385d4554f7897ca0141d4",
+ "sha256:9361de40701666b034c59ad9e317bae95c973b9ff92513dd0eced11c6adf2e21",
+ "sha256:9669179786254a2e7e57f0ecf224e978471491d660aaca833f845b72a2df3709",
+ "sha256:aac1ba0a253e17889550ddb1b60a2063f7474155465577caa2a3b131224cfd54",
+ "sha256:aef72eae10b5e3116bac6957de1df4d75909fc76d1499a53fb6387434b6bcd8d",
+ "sha256:bd3166bb3b111e76a4f8e2980fa1addf2920a4ca9b2b8ca36a3bc3dedc618270",
+ "sha256:c1b78fb9700fc961f53386ad2fd86d87091e06ede5d118b8a50dea285a071c24",
+ "sha256:c3888a051226e676e383de03bf49eb633cd39fc829516e5334e69b8d81aae751",
+ "sha256:c5f17ad25d2c1286436761b462e22b5020d83316f8e8fcb5deb2b3151f8f1d3a",
+ "sha256:c851b35fc078389bc16b915a0a7c1d5923e12e2c5aeec58c52f4aa8085ac8237",
+ "sha256:cb7df71de0af56000115eafd000b867d1261f786b5eebd88a0ca6360cccfaca7",
+ "sha256:cedb2f9e1f990918ea061f28a0f0077a07702e3819602d3507e2ff98c8d20636",
+ "sha256:e8caf961e1b1a945db76f1b5fa9c91498d15f545ac0ababbe575cfab185d3bd8"
+ ],
+ "version": "==5.3"
},
"decorator": {
"hashes": [
@@ -884,19 +1172,19 @@
},
"flake8": {
"hashes": [
- "sha256:15e351d19611c887e482fb960eae4d44845013cc142d42896e9862f775d8cf5c",
- "sha256:f04b9fcbac03b0a3e58c0ab3a0ecc462e023a9faf046d57794184028123aa208"
+ "sha256:749dbbd6bfd0cf1318af27bf97a14e28e5ff548ef8e5b1566ccfb25a11e7c839",
+ "sha256:aadae8761ec651813c24be05c6f7b4680857ef6afaae4651a4eccaef97ce6c3b"
],
"index": "ia",
- "version": "==3.8.3"
+ "version": "==3.8.4"
},
"flake8-annotations": {
"hashes": [
- "sha256:9091d920406a7ff10e401e0dd1baa396d1d7d2e3d101a9beecf815f5894ad554",
- "sha256:f59fdceb8c8f380a20aed20e1ba8a57bde05935958166c52be2249f113f7ab75"
+ "sha256:0bcebb0792f1f96d617ded674dca7bf64181870bfe5dace353a1483551f8e5f1",
+ "sha256:bebd11a850f6987a943ce8cdff4159767e0f5f89b3c88aca64680c2175ee02df"
],
"index": "ia",
- "version": "==2.1.0"
+ "version": "==2.4.1"
},
"idna": {
"hashes": [
@@ -913,19 +1201,26 @@
},
"importlib-metadata": {
"hashes": [
- "sha256:0505dd08068cfec00f53a74a0ad927676d7757da81b7436a6eefe4c7cf75c545",
- "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"
+ "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da",
+ "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"
],
"markers": "python_version < '3.8'",
- "version": "==1.6.1"
+ "version": "==2.0.0"
+ },
+ "iniconfig": {
+ "hashes": [
+ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+ "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
+ ],
+ "version": "==1.1.1"
},
"ipython": {
"hashes": [
- "sha256:0ef1433879816a960cd3ae1ae1dc82c64732ca75cec8dab5a4e29783fb571d0e",
- "sha256:1b85d65632211bf5d3e6f1406f3393c8c429a47d7b947b9a87812aa5bce6595c"
+ "sha256:c987e8178ced651532b3b1ff9965925bfd445c279239697052561a9ab806d28f",
+ "sha256:cbb2ef3d5961d44e6a963b9817d4ea4e1fa2eb589c371a470fed14d8d40cbd6a"
],
"index": "ia",
- "version": "==7.15.0"
+ "version": "==7.19.0"
},
"ipython-genutils": {
"hashes": [
@@ -936,17 +1231,17 @@
},
"isort": {
"hashes": [
- "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
- "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
+ "sha256:dcab1d98b469a12a1a624ead220584391648790275560e1a43e54c5dceae65e7",
+ "sha256:dcaeec1b5f0eca77faea2a35ab790b4f3680ff75590bfcb7145986905aab2f58"
],
- "version": "==4.3.21"
+ "version": "==5.6.4"
},
"jedi": {
"hashes": [
- "sha256:1ddb0ec78059e8e27ec9eb5098360b4ea0a3dd840bedf21415ea820c21b40a22",
- "sha256:807d5d4f96711a2bcfdd5dfa3b1ae6d09aa53832b182090b222b5efb81f52f63"
+ "sha256:86ed7d9b750603e4ba582ea8edc678657fb4007894a12bcf6f4bb97892f31d20",
+ "sha256:98cc583fa0f2f8304968199b01b6b4b94f469a1f4a74c1560506ca2a211378b5"
],
- "version": "==0.17.1"
+ "version": "==0.17.2"
},
"lazy-object-proxy": {
"hashes": [
@@ -981,32 +1276,25 @@
],
"version": "==0.6.1"
},
- "more-itertools": {
- "hashes": [
- "sha256:68c70cc7167bdf5c7c9d8f6954a7837089c6a36bf565383919bb595efb8a17e5",
- "sha256:b78134b2063dd214000685165d81c154522c3ee0a1c0d4d113c80361c234c5a2"
- ],
- "version": "==8.4.0"
- },
"mypy": {
"hashes": [
- "sha256:2c6cde8aa3426c1682d35190b59b71f661237d74b053822ea3d748e2c9578a7c",
- "sha256:3fdda71c067d3ddfb21da4b80e2686b71e9e5c72cca65fa216d207a358827f86",
- "sha256:5dd13ff1f2a97f94540fd37a49e5d255950ebcdf446fb597463a40d0df3fac8b",
- "sha256:6731603dfe0ce4352c555c6284c6db0dc935b685e9ce2e4cf220abe1e14386fd",
- "sha256:6bb93479caa6619d21d6e7160c552c1193f6952f0668cdda2f851156e85186fc",
- "sha256:81c7908b94239c4010e16642c9102bfc958ab14e36048fa77d0be3289dda76ea",
- "sha256:9c7a9a7ceb2871ba4bac1cf7217a7dd9ccd44c27c2950edbc6dc08530f32ad4e",
- "sha256:a4a2cbcfc4cbf45cd126f531dedda8485671545b43107ded25ce952aac6fb308",
- "sha256:b7fbfabdbcc78c4f6fc4712544b9b0d6bf171069c6e0e3cb82440dd10ced3406",
- "sha256:c05b9e4fb1d8a41d41dec8786c94f3b95d3c5f528298d769eb8e73d293abc48d",
- "sha256:d7df6eddb6054d21ca4d3c6249cae5578cb4602951fd2b6ee2f5510ffb098707",
- "sha256:e0b61738ab504e656d1fe4ff0c0601387a5489ca122d55390ade31f9ca0e252d",
- "sha256:eff7d4a85e9eea55afa34888dfeaccde99e7520b51f867ac28a48492c0b1130c",
- "sha256:f05644db6779387ccdb468cc47a44b4356fc2ffa9287135d05b70a98dc83b89a"
+ "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324",
+ "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc",
+ "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802",
+ "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122",
+ "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975",
+ "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7",
+ "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666",
+ "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669",
+ "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178",
+ "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01",
+ "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea",
+ "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de",
+ "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1",
+ "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c"
],
"index": "ia",
- "version": "==0.782"
+ "version": "==0.790"
},
"mypy-extensions": {
"hashes": [
@@ -1017,26 +1305,26 @@
},
"networkx": {
"hashes": [
- "sha256:cdfbf698749a5014bf2ed9db4a07a5295df1d3a53bf80bf3cbd61edf9df05fa1",
- "sha256:f8f4ff0b6f96e4f9b16af6b84622597b5334bf9cae8cf9b2e42e7985d5c95c64"
+ "sha256:7978955423fbc9639c10498878be59caf99b44dc304c2286162fd24b458c1602",
+ "sha256:8c5812e9f798d37c50570d15c4a69d5710a18d77bafc903ee9c5fba7454c616c"
],
- "version": "==2.4"
+ "version": "==2.5"
},
"ninja": {
"hashes": [
- "sha256:18bd4ebc6cef30981e966609362090a0d99aeca29a63ca83a3688305f1c35222",
- "sha256:39f9ab35f52b540777b77cc889ffed37182c7d55bec00f658f6f74bd5b1a4377",
- "sha256:3c206a4b8b896f396aeabfc0dbd99d84bc01306a3e07568d28d5536c24cbeaa3",
- "sha256:3d4b1a3fa4d68c9dc74f50875c9bfe4eaaf495b5205d12526aea95043488c8b6",
- "sha256:5ae857e0283acbf4b3645756d9e8217fddbe1f41dfe33e2c40dc79cb69706a8c",
- "sha256:607211b652a32006cda8a72a1496c348ddadcbe30986ff264e7354972fa3194e",
- "sha256:6ba8b42193600bfbde76dc32d7f6fd5675e253a9e5d7caad4a2735a84a72d491",
- "sha256:760de263a261919fc97cf1fd30d2dd8902dd89d5165d6cbf80ce3d66a39fff11",
- "sha256:9897b92c626caabe51fce04a9be851f635ed828a55c558a9cf1a75571b4c4fce",
- "sha256:ddfac074ae408e42c617cd44f90a95bf6db94f0c846c95ef2a3a9a03438027a1",
- "sha256:fa6d68b4f65aca57594d3cccfcf8fa7c8a311e93c55eed8043cabc439617d7b7"
+ "sha256:06a72090f5c5516e57f12699644179504a77585bed6d5f8be9e67219a398ec80",
+ "sha256:16fc1bea52a36a91a0e80c3b221d2c1bc9bcf04d0564da9344e349b8c5efd5c6",
+ "sha256:1d9ed3b5fdeb646516f54bec92453dcb3000d6771c2fea56451444c988a23e29",
+ "sha256:24acc95359308d11243386cf9f076bdc95f438ef6a4e0e357e7c122c5e02816d",
+ "sha256:4252ce532304841e47478bb61710fcf9940cf2c91731303490762b6e4f23fd2b",
+ "sha256:5c3a8cb54aaaf5d4f692d65121ef47b3e43dea123a6563153d9d97631c0adf4f",
+ "sha256:621fd73513a9bef0cb82e8c531a29ef96580b4d6e797f833cce167054ad812f8",
+ "sha256:99c6102ae9a8981afe4d06f92508dbeab1e28ec89783fb703411166f4e13c9ee",
+ "sha256:a1a9d9455623a3f45557fff6eb5abb3e70910dde28cfb9239e3ca14249149f55",
+ "sha256:c6059bd04ad235e2326b39bc71bb7989de8d565084b5f269557704747b2910fa",
+ "sha256:fb1ae96811a9b73773014b8a21d710b89d7d5f765427a5e2541e7fb9d530fdd5"
],
- "version": "==1.10.0.post1"
+ "version": "==1.10.0.post2"
},
"packaging": {
"hashes": [
@@ -1047,10 +1335,10 @@
},
"parso": {
"hashes": [
- "sha256:158c140fc04112dc45bca311633ae5033c2c2a7b732fa33d0955bad8152a8dd0",
- "sha256:908e9fae2144a076d72ae4e25539143d40b8e3eafbaeae03c1bfe226f4cdf12c"
+ "sha256:97218d9159b2520ff45eb78028ba8b50d2bc61dcc062a9682666f2dc4bd331ea",
+ "sha256:caba44724b994a8a5e086460bb212abc5a8bc46951bf4a9a1210745953622eb9"
],
- "version": "==0.7.0"
+ "version": "==0.7.1"
},
"pexpect": {
"hashes": [
@@ -1076,10 +1364,10 @@
},
"prompt-toolkit": {
"hashes": [
- "sha256:563d1a4140b63ff9dd587bda9557cffb2fe73650205ab6f4383092fb882e7dc8",
- "sha256:df7e9e63aea609b1da3a65641ceaf5bc7d05e0a04de5bd45d05dbeffbabf9e04"
+ "sha256:25c95d2ac813909f813c93fde734b6e44406d1477a9faef7c915ff37d39c0a8c",
+ "sha256:7debb9a521e0b1ee7d2fe96ee4bd60ef03c6492784de0547337ca4433e46aa63"
],
- "version": "==3.0.5"
+ "version": "==3.0.8"
},
"ptyprocess": {
"hashes": [
@@ -1090,10 +1378,10 @@
},
"py": {
"hashes": [
- "sha256:a673fa23d7000440cc885c17dbd34fafcb7d7a6e230b29f6766400de36a33c44",
- "sha256:f3b3a4c36512a4c4f024041ab51866f11761cc169670204b235f6b20523d4e6b"
+ "sha256:366389d1db726cd2fcfc79732e75410e5fe4d31db13692115529d34069a043c2",
+ "sha256:9ca6883ce56b4e8da7e79ac18787889fa5206c79dcc67fb065376cd2fe03f342"
],
- "version": "==1.8.2"
+ "version": "==1.9.0"
},
"pycodestyle": {
"hashes": [
@@ -1111,18 +1399,18 @@
},
"pygments": {
"hashes": [
- "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44",
- "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"
+ "sha256:381985fcc551eb9d37c52088a32914e00517e57f4a21609f48141ba08e193fa0",
+ "sha256:88a0bbcd659fcb9573703957c6b9cff9fab7295e6e76db54c9d00ae42df32773"
],
- "version": "==2.6.1"
+ "version": "==2.7.2"
},
"pylint": {
"hashes": [
- "sha256:7dd78437f2d8d019717dbf287772d0b2dbdfd13fc016aa7faa08d67bccc46adc",
- "sha256:d0ece7d223fe422088b0e8f13fa0a1e8eb745ebffcb8ed53d3e95394b6101a1c"
+ "sha256:bb4a908c9dadbc3aac18860550e870f58e1a02c9f2c204fdf5693d73be061210",
+ "sha256:bfe68f020f8a0fece830a22dd4d5dddb4ecc6137db04face4c3420a46a52239f"
],
"index": "ia",
- "version": "==2.5.3"
+ "version": "==2.6.0"
},
"pyparsing": {
"hashes": [
@@ -1133,27 +1421,27 @@
},
"pytest": {
"hashes": [
- "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1",
- "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"
+ "sha256:4288fed0d9153d9646bfcdf0c0428197dba1ecb27a33bb6e031d002fa88653fe",
+ "sha256:c0a7e94a8cdbc5422a51ccdad8e6f1024795939cc89159a0ae7f0b316ad3823e"
],
"index": "ia",
- "version": "==5.4.3"
+ "version": "==6.1.2"
},
"pytest-cov": {
"hashes": [
- "sha256:1a629dc9f48e53512fcbfda6b07de490c374b0c83c55ff7a1720b3fccff0ac87",
- "sha256:6e6d18092dce6fad667cd7020deed816f858ad3b49d5b5e2b1cc1c97a4dba65c"
+ "sha256:45ec2d5182f89a81fc3eb29e3d1ed3113b9e9a873bcddb2a71faaab066110191",
+ "sha256:47bd0ce14056fdd79f93e1713f88fad7bdcc583dcd7783da86ef2f085a0bb88e"
],
"index": "ia",
- "version": "==2.10.0"
+ "version": "==2.10.1"
},
"pytest-mock": {
"hashes": [
- "sha256:636e792f7dd9e2c80657e174c04bf7aa92672350090736d82e97e92ce8f68737",
- "sha256:a9fedba70e37acf016238bb2293f2652ce19985ceb245bbd3d7f3e4032667402"
+ "sha256:024e405ad382646318c4281948aadf6fe1135632bea9cc67366ea0c4098ef5f2",
+ "sha256:a4d6d37329e4a893e77d9ffa89e838dd2b45d5dc099984cf03c703ac8411bb82"
],
"index": "ia",
- "version": "==3.1.1"
+ "version": "==3.3.1"
},
"pytest-pylint": {
"hashes": [
@@ -1172,10 +1460,10 @@
},
"pytype": {
"hashes": [
- "sha256:08ddb9940764492b701a8985c30437239eb2c34003448cca760769264f5ff2f8"
+ "sha256:01c2dc3664b550e5c571c432035eda85c5b1ba0bc2675f50bd24f226fda25fc2"
],
"index": "ia",
- "version": "==2020.6.1"
+ "version": "==2020.11.3"
},
"pyyaml": {
"hashes": [
@@ -1203,11 +1491,11 @@
},
"responses": {
"hashes": [
- "sha256:7bb697a5fedeb41d81e8b87f152d453d5cab42dcd1691b6a7d6097e94d33f373",
- "sha256:af94d28cdfb48ded0ad82a5216616631543650f440334a693479b8991a6594a2"
+ "sha256:0de50fbf600adf5ef9f0821b85cc537acca98d66bc7776755924476775c1989c",
+ "sha256:e80d5276011a4b79ecb62c5f82ba07aa23fb31ecbc95ee7cad6de250a3c97444"
],
"index": "ia",
- "version": "==0.10.15"
+ "version": "==0.12.0"
},
"six": {
"hashes": [
@@ -1218,59 +1506,68 @@
},
"toml": {
"hashes": [
- "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f",
- "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
- "version": "==0.10.1"
+ "version": "==0.10.2"
},
"traitlets": {
"hashes": [
- "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44",
- "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7"
+ "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396",
+ "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"
],
- "version": "==4.3.3"
+ "version": "==5.0.5"
},
"typed-ast": {
"hashes": [
"sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
"sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+ "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
"sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
"sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
"sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+ "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
"sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
"sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
"sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
"sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
"sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
"sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+ "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
"sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
"sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+ "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
"sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+ "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
"sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
"sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
"sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
"sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
"sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+ "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
+ "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
+ "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
"sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+ "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
"sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
],
"version": "==1.4.1"
},
"typing-extensions": {
"hashes": [
- "sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5",
- "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae",
- "sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392"
+ "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
+ "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
+ "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
],
- "version": "==3.7.4.2"
+ "version": "==3.7.4.3"
},
"urllib3": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2",
+ "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"
],
"markers": "python_version != '3.4'",
- "version": "==1.22"
+ "version": "==1.25.11"
},
"wcwidth": {
"hashes": [
@@ -1287,10 +1584,10 @@
},
"zipp": {
"hashes": [
- "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b",
- "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"
+ "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
+ "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
],
- "version": "==3.1.0"
+ "version": "==3.4.0"
}
}
}
diff --git a/python/example.env b/python/example.env
index 4d3baa0..5064c96 100644
--- a/python/example.env
+++ b/python/example.env
@@ -1,5 +1,5 @@
-MINIO_ACCESS_KEY="minioadmin"
-MINIO_SECRET_KEY="minioadmin"
+SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin"
+SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"
IA_ACCESS_KEY="dummy"
IA_SECRET_KEY="dummy"
CDX_AUTH_TOKEN="dummy"
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 0eae6fe..a22d47d 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -59,7 +59,7 @@ def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
addr_e = ae.find("./{%s}address" % ns)
if addr_e:
address = dict()
- for t in addr_e.getchildren():
+ for t in addr_e:
address[t.tag.split("}")[-1]] = t.text
if address:
affiliation["address"] = address
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index fe507a0..2a1d8b5 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -133,7 +133,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.sink = None
diff --git a/python/ingest_file.py b/python/ingest_file.py
index f6f694e..ad78f50 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -78,7 +78,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!", file=sys.stderr)
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.func(args)
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 0d33ec9..10a0f48 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -115,7 +115,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!", file=sys.stderr)
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.text_sink = None
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index ec92afe..5cffa8c 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -105,7 +105,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.sink = None
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 66e02aa..69e9374 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
-Commands for backfilling content from bulk files into postgresql and s3 (minio).
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
@@ -120,16 +120,16 @@ def main():
help="postgresql database connection string",
default="postgres:///sandcrawler")
parser.add_argument('--s3-url',
- help="S3 (minio) backend URL",
+ help="S3 (seaweedfs) backend URL",
default="localhost:9000")
parser.add_argument('--s3-access-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_ACCESS_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
parser.add_argument('--s3-secret-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_SECRET_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
parser.add_argument('--s3-bucket',
- help="S3 (minio) bucket to persist into",
+ help="S3 (seaweedfs) bucket to persist into",
default="sandcrawler-dev")
subparsers = parser.add_subparsers()
@@ -144,7 +144,7 @@ def main():
help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
+ help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_grobid.set_defaults(func=run_grobid)
sub_grobid.add_argument('json_file',
help="grobid file to import from (or '-' for stdin)",
@@ -180,7 +180,7 @@ def main():
type=str)
sub_pdftrio = subparsers.add_parser('pdftrio',
- help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+ help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_pdftrio.set_defaults(func=run_pdftrio)
sub_pdftrio.add_argument('json_file',
help="pdftrio file to import from (or '-' for stdin)",
diff --git a/python/pytest.ini b/python/pytest.ini
index 65f81da..034a68e 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -1,7 +1,5 @@
[pytest]
-ignore = setup.py
-
# allow imports from files in current directory
python_paths = .
@@ -18,5 +16,6 @@ filterwarnings =
ignore::DeprecationWarning:.*urllib3
ignore::DeprecationWarning:.*wayback
ignore::DeprecationWarning:.*PIL
+ ignore::DeprecationWarning:.*justext
log_level = INFO
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 793f1c4..066e53b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,6 +1,7 @@
import json
import datetime
+from typing import Optional
import psycopg2
import psycopg2.extras
@@ -43,6 +44,18 @@ class SandcrawlerPostgrestClient:
else:
return None
+ def get_html_meta(self, sha1hex: str) -> Optional[dict]:
+ resp = requests.get(
+ self.api_url + "/html_meta",
+ params=dict(sha1hex=f"eq.{sha1hex}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
def get_file_meta(self, sha1):
resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
resp.raise_for_status()
@@ -52,12 +65,15 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_ingest_file_result(self, url):
- resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+ def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = requests.get(
+ self.api_url + "/ingest_file_result",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
@@ -232,6 +248,41 @@ class SandcrawlerPostgresClient:
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
+ def insert_html_meta(self, cur, batch, on_conflict="nothing"):
+ """
+ batch elements are expected to have .to_sql_tuple() method
+ """
+ sql = """
+ INSERT INTO
+ html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status=EXCLUDED.status,
+ scope=EXCLUDED.scope,
+ has_teixml=EXCLUDED.has_teixml,
+ has_thumbnail=EXCLUDED.has_thumbnail,
+ word_count=EXCLUDED.word_count,
+ biblio=EXCLUDED.biblio,
+ resources=EXCLUDED.resources
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ batch = [d.to_sql_tuple() for d in batch]
+ # filter out duplicate rows by key (sha1hex)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[b[0]] = b
+ batch = list(batch_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
sql = """
INSERT INTO
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
new file mode 100644
index 0000000..f2819c2
--- /dev/null
+++ b/python/sandcrawler/html_ingest.py
@@ -0,0 +1,337 @@
+
+import io
+import sys
+import json
+import datetime
+import argparse
+import xml.etree.ElementTree as ET
+from typing import List, Optional, Any, Tuple
+
+import trafilatura
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding
+from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+
+
+TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
+def html_extract_body_teixml(doc: bytes) -> dict:
+ tei_xml = trafilatura.extract(doc,
+ tei_output=True,
+ include_comments=False,
+ include_formatting=True,
+ )
+ if tei_xml:
+ body_txt = teixml_body_text(tei_xml)
+ word_count = len(body_txt.split())
+ return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
+ elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+ # hack for firstmonday.org
+ return html_extract_body_teixml(doc[106:])
+ else:
+ return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
+
+def teixml_body_text(doc_xml: str) -> str:
+ ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+ tree = ET.fromstring(doc_xml)
+ body = tree.find('.//tei:body', ns)
+ if body is not None:
+ return " ".join(body.itertext())
+ else:
+ return ""
+
+class WebResource(pydantic.BaseModel):
+ surt: str
+ timestamp: datetime.datetime
+ url: str
+ sha1hex: str
+ mimetype: str
+ status_code: int
+ size: Optional[int]
+ sha256hex: Optional[str]
+ resource_type: Optional[str]
+
+ class Config:
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat()
+ }
+
+class IngestWebResult(pydantic.BaseModel):
+ status: str
+ hit: bool
+ error_message: Optional[str]
+ cdx: Optional[dict]
+ terminal: Optional[Any] # TODO
+ request: Optional[Any] # TODO
+ file_meta: Optional[dict]
+ html_biblio: Optional[BiblioMetadata]
+ scope: Optional[str]
+ html_body: Optional[dict]
+ html_resources: Optional[List[WebResource]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
class HtmlMetaRow(pydantic.BaseModel):
    """
    Row-shaped summary of an HTML ingest, for the html_meta SQL table (see
    to_sql_tuple()).
    """
    sha1hex: str
    status: str
    scope: Optional[str]
    has_teixml: bool
    has_thumbnail: bool
    word_count: Optional[int]
    biblio: Optional[dict]
    resources: Optional[List[dict]]

    class Config:
        arbitrary_types_allowed = True
        # serialize datetimes as ISO 8601 strings in JSON output
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat(),
        }

    def to_sql_tuple(self) -> Tuple:
        """
        This is for the html_meta SQL table.

        Returns a 9-tuple: (sha1hex, updated, status, scope, has_teixml,
        has_thumbnail, word_count, biblio, resources). 'biblio' and
        'resources' are JSON strings with sorted keys, or None when the
        field is unset or empty.
        """
        return (
            self.sha1hex,
            # naive local time, taken when this method is called
            # NOTE(review): confirm the column does not expect UTC
            datetime.datetime.now(), # updated
            self.status,
            self.scope,
            self.has_teixml,
            self.has_thumbnail,
            self.word_count,
            # "(x or None) and dumps(x)": falsy values (None, {}, []) become
            # None instead of being serialized
            (self.biblio or None) and json.dumps(self.biblio, sort_keys=True),
            (self.resources or None) and json.dumps(self.resources, sort_keys=True),
        )
+
+
def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the lazy version that just does a CDX lookup for each resource.

    Takes a list instead of single record because we may want to circuit break
    on failure, and may introduce concurrency internal to this function.

    Each item of 'resources' is a dict with 'url' and 'type' keys. If 'when'
    is given, the capture closest to that time is requested from CDX.

    Returns one WebResource per input resource, in input order, with 'size'
    and 'sha256hex' left as None (no body is fetched). Raises Exception as
    soon as any CDX lookup comes back empty.
    """

    full = []
    closest = when and datetime_to_cdx(when)
    for resource in resources:
        cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
        if not cdx_row:
            raise Exception("CDX lookup failed")
        # NOTE: cdx_row.url is not required to equal resource['url']; a
        # mismatch is tolerated and the CDX URL is what gets recorded.
        full.append(WebResource(
            surt=cdx_row.surt,
            # CDX rows carry the capture time as a CDX-format string; parse it
            # the same way fetch_html_resources() and run_single() do rather
            # than leaving the conversion to pydantic's generic coercion
            timestamp=parse_cdx_datetime(cdx_row.datetime),
            url=cdx_row.url,
            sha1hex=cdx_row.sha1hex,
            mimetype=cdx_row.mimetype,
            status_code=cdx_row.status_code,
            size=None,
            sha256hex=None,
            resource_type=resource['type'],
        ))

    return full
+
+
def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the full version which fetches each resource from wayback/petabox
    and calculates additional hashes.

    Resources are fetched one after another, in input order. An Exception is
    raised on the first resource whose lookup does not succeed, or whose
    fetched body does not hash to the sha1hex recorded in its CDX row.

    Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
    """
    closest = when and datetime_to_cdx(when)

    def _fetch_one(item: dict) -> WebResource:
        resp = wayback_client.lookup_resource(item['url'], closest=closest)
        if not resp or resp.status != 'success':
            # TODO: raise a specific exception so we can catch it elsewhere?
            raise Exception("wayback lookup failed")
        meta = gen_file_metadata(resp.body)
        if meta['sha1hex'] != resp.cdx.sha1hex:
            raise Exception("wayback payload sha1hex mismatch")
        cdx = resp.cdx
        return WebResource(
            surt=cdx.surt,
            timestamp=parse_cdx_datetime(cdx.datetime),
            url=cdx.url,
            sha1hex=meta['sha1hex'],
            mimetype=meta['mimetype'],
            # fall back to the revisit record when the CDX row has no status
            status_code=cdx.status_code or resp.revisit_cdx.status_code,
            size=meta['size_bytes'],
            sha256hex=meta['sha256hex'],
            resource_type=item['type'],
        )

    return [_fetch_one(item) for item in resources]
+
+
def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
    """
    This function tries to guess if an HTML document represents one of:

    - article-fulltext
    - article-abstract
    - article-sample
    - supplement
    - component
    - issue-fulltext
    - landingpage
    - paywall
    - loginwall
    - blockpage
    - errorpage
    - stub
    - unknown

    Only a subset of these is currently ever returned: "blockpage",
    "article-sample", "landingpage", "article-fulltext", "stub" and
    "unknown". The 'doc' argument is accepted but not inspected yet.

    Checks run in order: URL-based rules, then a match between 'url' and
    biblio.html_fulltext_url, then a fallback on 'word_count'.
    """

    # basic block page and sample detection based on URL
    if url.endswith("/cookieAbsent"):
        return "blockpage"
    if "://page-one.live.cf.public.springer.com" in url:
        return "article-sample"

    if "scielo" in url:
        if "sci_abstract" in url:
            return "landingpage"
        if "sci_arttext" in url:
            return "article-fulltext"

    if biblio and biblio.html_fulltext_url == url:
        return "article-fulltext"

    # fallback: guess based on word count (arbitrary thresholds)
    if word_count is None:
        return "unknown"
    if word_count < 20:
        return "stub"
    if word_count > 800:
        return "article-fulltext"

    return "unknown"
+
+
def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
    """
    Ingests a single HTML document from wayback, end to end.

    Looks up 'url' in wayback (closest to 'timestamp' if given), checks the
    mimetype, extracts bibliographic metadata and TEI-XML body text, guesses
    the page scope, and then resolves every sub-resource of the page: with a
    CDX lookup only when 'quick_mode' is set, otherwise with a full fetch.

    Returns an IngestWebResult; 'hit' is True only for status="success".
    """

    adblock = load_adblock_rules()
    wayback_client = WaybackClient()

    html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)

    def _cdx_dict():
        # evaluated at call time, so it follows html_resource being rebound
        return html_resource.cdx and cdx_to_dict(html_resource.cdx)

    if html_resource.status != "success":
        return IngestWebResult(
            status=html_resource.status,
            hit=False,
            cdx=_cdx_dict(),
        )

    assert html_resource.terminal_status_code == 200

    file_meta = gen_file_metadata(html_resource.body)
    file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)

    if file_meta['mimetype'] not in ("text/html", "text/xml"):
        return IngestWebResult(
            status="wrong-mimetype",
            hit=False,
            cdx=_cdx_dict(),
            file_meta=file_meta,
        )

    html_doc = HTMLParser(html_resource.body)
    html_biblio = html_extract_biblio(url, html_doc)
    html_body = html_extract_body_teixml(html_resource.body)
    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
    if html_scope not in ('article-fulltext', 'unknown'):
        return IngestWebResult(
            status="wrong-scope",
            hit=False,
            cdx=_cdx_dict(),
            file_meta=file_meta,
            html_biblio=html_biblio,
            scope=html_scope,
        )

    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
    assert len(raw_resources) <= 200

    # sub-resources are looked up closest to the capture time of the page
    when = parse_cdx_datetime(html_resource.cdx.datetime)

    full_resources: List[WebResource] = []
    if quick_mode:
        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
    else:
        full_resources = fetch_html_resources(raw_resources, wayback_client, when)

    return IngestWebResult(
        status="success",
        hit=True,
        cdx=_cdx_dict(),
        file_meta=file_meta,
        html_body=html_body,
        html_biblio=html_biblio,
        scope=html_scope,
        html_resources=full_resources,
    )
+
+
def main() -> None:
    """
    Command-line entry point. Run this command like:

        python -m sandcrawler.html_ingest

    The only sub-command is "single", which ingests one URL and prints the
    result as JSON on stdout. With no sub-command, help goes to stderr and
    the process exits with status -1.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    single = subparsers.add_parser(
        "single", help="tries to ingest a single URL, dumps result to stdout"
    )
    single.set_defaults(func="run_single")
    single.add_argument("url", help="URL to fetch", type=str)
    single.add_argument(
        "--timestamp",
        help="timestamp for which to fetch document from wayback",
        type=str,
    )
    single.add_argument(
        "--quick-mode",
        help="don't fetch resources, only do CDX lookup",
        action="store_true",
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    if args.func != "run_single":
        raise NotImplementedError()

    result = run_single(args.url, args.timestamp, args.quick_mode)
    print(result.json(indent=2, exclude_none=True))
+
+if __name__ == "__main__":
+ main()
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
new file mode 100644
index 0000000..0d14166
--- /dev/null
+++ b/python/sandcrawler/html_metadata.py
@@ -0,0 +1,452 @@
+
+import datetime
+from typing import List, Optional, Any, Tuple, Dict
+import urllib.parse
+
+import dateparser
+from selectolax.parser import HTMLParser
+import pydantic
+import braveblock
+
+
+# this is a map of metadata keys to CSS selectors
+# sources for this list include:
+# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
+# - inspection of actual publisher HTML
+# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
+# order of these are mostly by preference/quality (best option first), though
+# also/sometimes re-ordered for lookup efficiency (lookup stops after first
+# match)
+HEAD_META_PATTERNS: Any = {
+ "title": [
+ "meta[name='citation_title']",
+ "meta[name='eprints.title']",
+ "meta[name='prism.title']",
+ "meta[name='bepress_citation_title']",
+ "meta[name='og:title']",
+ "meta[name='dcterms.title']",
+ "meta[name='dc.title']",
+ ],
+ "subtitle": [
+ "meta[name='prism.subtitle']",
+ ],
+ "doi": [
+ "meta[name='citation_doi']",
+ "meta[name='DOI']",
+ "meta[id='DOI']",
+ "meta[name='prism.doi']",
+ "meta[name='bepress_citation_doi']",
+ "meta[name='dc.identifier.doi']",
+ "meta[name='dc.identifier'][scheme='doi']",
+ ],
+ "pmid": [
+ "meta[name='citation_pmid']",
+ ],
+ "abstract": [
+ "meta[name='citation_abstract']",
+ "meta[name='bepress_citation_abstract']",
+ "meta[name='eprints.abstract']",
+ "meta[name='dcterms.abstract']",
+ "meta[name='prism.teaser']",
+ "meta[name='dc.description']",
+ "meta[name='og:description']",
+ ],
+ "container_name": [
+ "meta[name='citation_journal_title']",
+ "meta[name='bepress_citation_journal_title']",
+ "meta[name='citation_conference_title']",
+ "meta[name='bepress_citation_conference_title']",
+ "meta[name='prism.publicationName']",
+ "meta[name='eprints.publication']",
+ "meta[name='dc.relation.ispartof']",
+ "meta[name='dc.source']",
+ "meta[property='og:site_name']",
+ ],
+ "container_abbrev": [
+ "meta[name='citation_journal_abbrev']",
+ ],
+ "raw_date": [
+ "meta[name='citation_publication_date']",
+ "meta[name='bepress_citation_publication_date']",
+ "meta[name='prism.publicationDate']",
+ "meta[name='citation_date']",
+ "meta[name='bepress_citation_date']",
+ "meta[name='citation_online_date']",
+ "meta[name='bepress_citation_online_date']",
+ "meta[itemprop='datePublished']",
+ "meta[name='article:published']",
+ "meta[name='eprints.datestamp']",
+ "meta[name='eprints.date']",
+ "meta[name='dc.date.created']",
+ "meta[name='dc.issued']",
+ "meta[name='dcterms.date']",
+ "meta[name='dc.date']",
+ ],
+ "release_year": [
+ "meta[itemprop='citation_year']",
+ "meta[itemprop='prism:copyrightYear']",
+ ],
+ "first_page": [
+ "meta[name='citation_firstpage']",
+ "meta[name='bepress_citation_firstpage']",
+ "meta[name='prism.startingPage']",
+ "meta[name='dc.citation.spage']",
+ ],
+ "last_page": [
+ "meta[name='citation_lastpage']",
+ "meta[name='bepress_citation_lastpage']",
+ "meta[name='prism.endingPage']",
+ "meta[name='dc.citation.epage']",
+ ],
+ "issue": [
+ "meta[name='citation_issue']",
+ "meta[name='bepress_citation_issue']",
+ "meta[name='prism.issueIdentifier']",
+ "meta[name='dc.citation.issue']",
+ ],
+ "volume": [
+ "meta[name='citation_volume']",
+ "meta[name='bepress_citation_volume']",
+ "meta[name='prism.volume']",
+ "meta[name='dc.citation.volume']",
+ ],
+ "number": [
+ "meta[name='citation_technical_report_number']",
+ "meta[name='bepress_citation_technical_report_number']",
+ "meta[name='citation_number']",
+ "meta[name='bepress_citation_number']",
+ "meta[name='prism.number']",
+ ],
+ "container_issn": [
+ "meta[name='citation_issn']",
+ "meta[name='bepress_citation_issn']",
+ "meta[name='prism.issn']",
+ "meta[name='prism.eIssn']",
+ "meta[name='eprints.issn']",
+ "meta[name='dc.source.issn']",
+ ],
+ "isbn": [
+ "meta[name='citation_isbn']",
+ "meta[name='bepress_citation_isbn']",
+ "meta[name='prism.isbn']",
+ ],
+ "publisher": [
+ "meta[name='citation_publisher']",
+ "meta[name='bepress_citation_publisher']",
+ "meta[name='eprints.publisher']",
+ "meta[name='citation_technical_report_institution']",
+ "meta[name='dcterms.publisher']",
+ "meta[name='dc.publisher']",
+ ],
+ "raw_release_type": [
+ "meta[name='citation_article_type']",
+ "meta[name='bepress_citation_article_type']",
+ "meta[name='prism.contentType']",
+ "meta[name='eprints.type']",
+ "meta[name='dc.type']",
+ ],
+ "lang": [
+ "meta[name='citation_language']",
+ "meta[name='bepress_citation_language']",
+ "meta[name='dcterms.language']",
+ "meta[name='dc.language']",
+ "meta[name='og:locale']",
+ ],
+}
+
# Like HEAD_META_PATTERNS, but for fields that collect several values:
# html_extract_biblio() gathers the 'content' of every element matched by a
# selector into a list.
HEAD_META_LIST_PATTERNS: Any = {
    "contrib_names": [
        "meta[name='citation_author']",
        "meta[name='bepress_citation_author']",
        "meta[name='eprints.creators_name']",
        "meta[name='dcterms.creator']",
        "meta[name='article:author']",
        "meta[name='dc.creator']",
        "meta[name='dc.contributor']",
    ],
    # TODO: citation_author_institution
    "raw_references": [
        "meta[name='citation_reference']",
    ],
    # not stored as-is: html_extract_biblio() pops this list and mines it for
    # DOI and ISBN values
    "raw_identifiers": [
        "meta[name='eprints.id_number']",
        "meta[name='dcterms.identifier']",
        "meta[name='dc.identifier']",
    ],
}

# The *_FULLTEXT_PATTERNS lists are consumed by html_extract_fulltext_url().
# Keys of each pattern dict, as read by that function:
#   selector:         CSS selector; the first matching element is used
#   attr:             attribute of that element which holds the URL
#   technique:        label returned alongside the URL
#   in_doc_url:       (optional) pattern only applies if this substring is in
#                     the document URL
#   in_fulltext_url:  (optional) extracted URL is only accepted if it contains
#                     this substring
XML_FULLTEXT_PATTERNS: List[dict] = [
    {
        "selector": "meta[name='citation_xml_url']",
        "attr": "content",
        "technique": "citation_xml_url",
    },
    {
        "selector": "link[rel='alternate'][type='application/xml']",
        "attr": "href",
        "technique": "alternate link",
    },
    {
        "in_doc_url": "scielo",
        "in_fulltext_url": "articleXML",
        "selector": "a[target='xml']",
        "attr": "href",
        "technique": "SciElo XML link",
    },
]

HTML_FULLTEXT_PATTERNS: List[dict] = [
    {
        "selector": "meta[name='citation_fulltext_html_url']",
        "attr": "content",
        "technique": "citation_fulltext_html_url",
    },
]

PDF_FULLTEXT_PATTERNS: List[dict] = [
    {
        "selector": "meta[name='citation_pdf_url']",
        "attr": "content",
        "technique": "citation_pdf_url",
    },
    {
        # NOTE(review): reports the same technique label as the pattern above
        # even though the selector is the bepress variant -- confirm intended
        "selector": "meta[name='bepress_citation_pdf_url']",
        "attr": "content",
        "technique": "citation_pdf_url",
    },
]

# maps a raw release type from page metadata (lower-cased and stripped by
# html_extract_biblio() before lookup) to a release_type value
RELEASE_TYPE_MAP = {
    "research article": "article-journal",
    "text.serial.journal": "article-journal",
}
+
+
class BiblioMetadata(pydantic.BaseModel):
    """
    Bibliographic metadata extracted from the <head> of an HTML document.

    Built by html_extract_biblio() from the keys of HEAD_META_PATTERNS and
    HEAD_META_LIST_PATTERNS plus a few derived values. Every field is
    optional.
    """
    title: Optional[str]
    subtitle: Optional[str]
    contrib_names: Optional[List[str]]
    # parsed from the page's raw date string by html_extract_biblio()
    release_date: Optional[datetime.date]
    release_year: Optional[int]
    # looked up in RELEASE_TYPE_MAP by html_extract_biblio()
    release_type: Optional[str]
    release_stage: Optional[str]
    withdrawn_status: Optional[str]
    lang: Optional[str]
    country_code: Optional[str]
    volume: Optional[str]
    issue: Optional[str]
    number: Optional[str]
    pages: Optional[str]
    first_page: Optional[str]
    last_page: Optional[str]
    license: Optional[str]
    publisher: Optional[str]
    container_name: Optional[str]
    container_abbrev: Optional[str]
    container_issn: Optional[str]
    container_type: Optional[str]
    raw_references: Optional[List[str]]

    doi: Optional[str]
    pmid: Optional[str]
    pmcid: Optional[str]
    # NOTE(review): html_extract_biblio() collects its ISBN value under the
    # key 'isbn', which is not a field declared here -- confirm whether that
    # value is meant to end up in isbn13
    isbn13: Optional[str]
    publisher_ident: Optional[str]
    oai_id: Optional[str]

    abstract: Optional[str]
    # absolute URLs found via the *_FULLTEXT_PATTERNS lists
    pdf_fulltext_url: Optional[str]
    html_fulltext_url: Optional[str]
    xml_fulltext_url: Optional[str]

    class Config:
        # serialize dates as ISO 8601 strings in JSON output
        json_encoders = {
            datetime.date: lambda dt: dt.isoformat()
        }
+
+
def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
    """
    Tries to quickly extract fulltext URLs using a set of patterns. This
    function is intended to be generic across various extraction techniques.

    Patterns are tried in order and the first usable one wins. A pattern is
    skipped when it lacks a 'selector' or 'attr' key, when its 'in_doc_url'
    substring is not in doc_url, when the selector matches nothing, when the
    matched element has no (or an empty) value for 'attr', or when the
    resolved URL does not contain the pattern's 'in_fulltext_url' substring.

    The extracted value is resolved against doc_url, so the result is an
    absolute URL.

    Returns None or a tuple of (url, technique)
    """
    for pattern in patterns:
        selector = pattern.get('selector')
        attr = pattern.get('attr')
        # without both of these there is nothing to look up
        if not selector or not attr:
            continue
        if 'in_doc_url' in pattern and pattern['in_doc_url'] not in doc_url:
            continue
        elem = doc.css_first(selector)
        if not elem:
            continue
        # .get(): the element may match the selector without carrying the
        # attribute at all; treat that like an empty value and move on
        raw_val = elem.attrs.get(attr)
        if not raw_val:
            continue
        val = urllib.parse.urljoin(doc_url, raw_val)
        if 'in_fulltext_url' in pattern and pattern['in_fulltext_url'] not in val:
            continue
        return (val, pattern.get('technique', 'unknown'))
    return None
+
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
    """
    Extracts bibliographic metadata from the <meta> tags in the <head> of an
    HTML document, plus PDF/XML/HTML fulltext URLs found via the
    *_FULLTEXT_PATTERNS lists.

    For each field, selectors are tried in order and the first one that
    yields a non-empty 'content' value wins. Meta tags without a 'content'
    attribute are ignored.

    Returns None if the document has no <head>; otherwise a BiblioMetadata
    object (possibly with every field unset).
    """

    meta: Any = dict()
    head = doc.css_first("head")
    if not head:
        return None

    # single-valued fields: first selector with non-empty content wins
    for field, patterns in HEAD_META_PATTERNS.items():
        for pattern in patterns:
            node = head.css_first(pattern)
            content = node.attrs.get('content') if node else None
            if content:
                meta[field] = content
                break

    # multi-valued fields: all non-empty contents of the first selector that
    # yields any
    for field, patterns in HEAD_META_LIST_PATTERNS.items():
        for pattern in patterns:
            contents = [node.attrs.get('content') for node in head.css(pattern)]
            contents = [c for c in contents if c]
            if contents:
                meta[field] = contents
                break

    # (some) fulltext extractions
    pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
    if pdf_fulltext_url:
        meta['pdf_fulltext_url'] = pdf_fulltext_url[0]
    xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
    if xml_fulltext_url:
        meta['xml_fulltext_url'] = xml_fulltext_url[0]
    html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
    if html_fulltext_url:
        meta['html_fulltext_url'] = html_fulltext_url[0]

    # TODO: replace with clean_doi() et al
    doi = meta.get('doi')
    if doi and doi.startswith('doi:'):
        meta['doi'] = doi[4:]

    # generic identifiers only fill in what the dedicated tags did not
    raw_identifiers = meta.pop('raw_identifiers', [])
    for ident in raw_identifiers:
        if ident.startswith('doi:10.'):
            if 'doi' not in meta:
                meta['doi'] = ident.replace('doi:', '')
        elif ident.startswith('10.') and '/' in ident:
            if 'doi' not in meta:
                meta['doi'] = ident
        elif ident.startswith('isbn:'):
            # NOTE(review): BiblioMetadata declares 'isbn13', not 'isbn' --
            # confirm where this value is supposed to land
            if 'isbn' not in meta:
                meta['isbn'] = ident.replace('isbn:', '')

    raw_date = meta.pop('raw_date', None)
    if raw_date:
        parsed = dateparser.parse(raw_date)
        if parsed:
            meta['release_date'] = parsed.date()

    raw_release_type = meta.pop('raw_release_type', None)
    if raw_release_type:
        release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
        if release_type:
            meta['release_type'] = release_type

    return BiblioMetadata(**meta)
+
def load_adblock_rules() -> braveblock.Adblocker:
    """
    Builds the adblocker used to filter unwanted sub-resources out of HTML
    pages (see html_extract_resources()): the easylist and easyprivacy lists
    plus the custom rules below.

    TODO: consider blocking very generic assets:
    - ://fonts.googleapis.com/css*
    - ://journals.plos.org/plosone/resource/img/icon.*
    """
    return braveblock.Adblocker(
        include_easylist=True,
        include_easyprivacy=True,
        rules=[
            "/favicon.ico^",
            "||fonts.googleapis.com^",
            "||widgets.figshare.com^",
            "||crossmark-cdn.crossref.org^",
            "||platform.twitter.com^",
            "||verify.nature.com^",
            "||s7.addthis.com^",
            "||www.mendeley.com^",
            "||pbs.twimg.com^",
            "||badge.dimensions.ai^",

            # not sure about these CC badges (usually via a redirect)
            #"||licensebuttons.net^",
            #"||i.creativecommons.org^",

            # Should we skip jquery, or other generic javascript CDNs?
            #"||code.jquery.com^",
            #"||ajax.googleapis.com^",
            #"||cdnjs.cloudflare.com^",

            # badges, "share" buttons, etc
            "apis.google.com/js/plusone",

            # PLOS images
            "/resource/img/icon.*.16.png^",
        ],
    )
+
+
+def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+ resources = []
+
+ for node in doc.css(selector):
+ for attr in attrs:
+ url = node.attrs.get(attr)
+ if url:
+ resources.append(dict(url=url, type=type_name))
+
+ return resources
+
+
def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list:
    """
    This function tries to find all the important resources in a page. The
    presumption is that the HTML document is article fulltext, and we want the
    list of all resources (by URL) necessary to replay the page.

    The returned resource URLs each have a type (script, img, css, etc), and
    should be fully-qualified URLs (not relative).

    Adblock filtering is run to remove unwanted resources.

    Returns a list of dict(url=..., type=...) without duplicates, in the
    order the resources were first found (so output is deterministic for a
    given document).
    """
    resources = []

    # select various resource references
    resources += _extract_generic(doc, "script", ["src"], "script")
    resources += _extract_generic(doc, "link[rel='stylesheet']", ["href"], "stylesheet")
    # TODO: srcset and parse
    # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w
    resources += _extract_generic(doc, "img", ["src"], "image")
    resources += _extract_generic(doc, "audio", ["src"], "audio")
    resources += _extract_generic(doc, "video", ["src"], "media")
    resources += _extract_generic(doc, "source", ["src"], "media")
    resources += _extract_generic(doc, "track", ["src"], "media")
    resources += _extract_generic(doc, "iframe", ["src"], "subdocument")
    resources += _extract_generic(doc, "embed", ["src"], "media")

    # ensure URLs are absolute
    for r in resources:
        r['url'] = urllib.parse.urljoin(doc_url, r['url'])

    # filter using adblocker (a truthy check result means "blocked")
    resources = [
        r for r in resources
        if not adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type'])
    ]

    # remove duplicates, keeping the first occurrence of each (url, type)
    seen = set()
    unique = []
    for r in resources:
        key = (r['url'], r['type'])
        if key in seen:
            continue
        seen.add(key)
        unique.append(r)

    return unique
+
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e6c6295..0b58f3b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,10 +3,14 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os, sys, time
+import os
+import sys
+import time
+import gzip
import json
import requests
import datetime
+from typing import Tuple
from collections import namedtuple
import http.client
@@ -17,7 +21,7 @@ http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
+from gwb.loader import CDXLoaderFactory3
from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
@@ -232,7 +236,7 @@ class CdxApiClient:
assert row.status_code == filter_status_code
return row
- def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+ def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -270,6 +274,10 @@ class CdxApiClient:
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+ if closest:
+ params['closest'] = closest
+ params['sort'] = "closest"
+ #print(params, file=sys.stderr)
rows = self._query_api(params)
if not rows:
return None
@@ -352,14 +360,14 @@ class WaybackClient:
raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ ))
try:
#print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
- print("Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
except ValueError as ve:
raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
@@ -398,8 +406,11 @@ class WaybackClient:
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
- revisit_uri = revisit_uri.decode('utf-8')
- revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ if type(revisit_uri) is bytes:
+ revisit_uri = revisit_uri.decode('utf-8')
+ if type(revisit_dt) is bytes:
+ revisit_dt = revisit_dt.decode('utf-8')
+ revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
assert len(revisit_dt) == 14
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
@@ -507,7 +518,7 @@ class WaybackClient:
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
if cdx_sha1hex != file_meta['sha1hex']:
- print("REPLAY MISMATCH: cdx:{} replay:{}".format(
+ print(" REPLAY MISMATCH: cdx:{} replay:{}".format(
cdx_sha1hex,
file_meta['sha1hex']),
file=sys.stderr)
@@ -568,7 +579,7 @@ class WaybackClient:
else:
return None
- def lookup_resource(self, start_url, best_mimetype=None):
+ def lookup_resource(self, start_url, best_mimetype=None, closest=None):
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -596,7 +607,7 @@ class WaybackClient:
urls_seen = [start_url]
for i in range(self.max_redirects):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
+ cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
#print(cdx_row, file=sys.stderr)
if not cdx_row:
return ResourceResult(
@@ -668,7 +679,7 @@ class WaybackClient:
)
assert 300 <= resource.status_code < 400
if not resource.location:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -697,7 +708,7 @@ class WaybackClient:
next_url = clean_url(next_url)
cdx_row = cdx_partial_from_row(cdx_row)
if not next_url:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -980,10 +991,10 @@ class SavePageNowClient:
best_mimetype="application/pdf",
)
if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
- print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
cdx_row = elsevier_pdf_cdx
else:
- print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
#print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
@@ -999,7 +1010,7 @@ class SavePageNowClient:
retry_sleep=9.0,
)
except KeyError as ke:
- print("CDX KeyError: {}".format(ke), file=sys.stderr)
+ print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -1060,3 +1071,24 @@ class SavePageNowClient:
revisit_cdx=revisit_cdx,
)
+
def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
    """
    Undoes a gzip layer that was left on a fetched body.

    Applies when the body sniffs as 'application/gzip' while the CDX record
    reports some other mimetype. In that case the body is decompressed and a
    new (file_meta, resource) pair is returned: file metadata is recomputed
    from the decompressed body and every other resource field is carried
    over. In all other cases both arguments are returned untouched.
    """
    looks_gzipped = (
        resource.body
        and file_meta['mimetype'] == 'application/gzip'
        and resource.cdx
        and resource.cdx.mimetype != 'application/gzip'
    )
    if not looks_gzipped:
        return (file_meta, resource)

    print("    transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
    unwrapped = ResourceResult(
        body=gzip.decompress(resource.body),
        # copy all other fields
        start_url=resource.start_url,
        hit=resource.hit,
        status=resource.status,
        terminal_url=resource.terminal_url,
        terminal_dt=resource.terminal_dt,
        terminal_status_code=resource.terminal_status_code,
        cdx=resource.cdx,
        revisit_cdx=resource.revisit_cdx,
    )
    return (gen_file_metadata(unwrapped.body), unwrapped)
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6d8b162..0c8eee6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -5,16 +5,25 @@ import gzip
import time
import base64
import requests
+from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import namedtuple
+from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding
from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url
+from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_ingest import fetch_html_resources, \
+ quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
+ WebResource
+from sandcrawler.html_metadata import html_extract_fulltext_url, \
+ XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, BiblioMetadata, \
+ html_extract_resources, html_extract_biblio, load_adblock_rules
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.xml import xml_reserialize
class IngestFileWorker(SandcrawlerWorker):
@@ -46,7 +55,7 @@ class IngestFileWorker(SandcrawlerWorker):
def __init__(self, sink=None, **kwargs):
super().__init__()
-
+
self.sink = sink
self.wayback_client = kwargs.get('wayback_client')
if not self.wayback_client:
@@ -63,12 +72,18 @@ class IngestFileWorker(SandcrawlerWorker):
self.grobid_sink = kwargs.get('grobid_sink')
self.thumbnail_sink = kwargs.get('thumbnail_sink')
self.pdftext_sink = kwargs.get('pdftext_sink')
+ self.xmldoc_sink = kwargs.get('xmldoc_sink')
+ self.htmlteixml_sink = kwargs.get('htmlteixml_sink')
+ self.max_hops = 6
self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
self.try_wayback = kwargs.get('try_wayback', True)
self.try_spn2 = kwargs.get('try_spn2', True)
+ self.html_quick_mode = False
+ self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
self.base_url_blocklist = [
# robot blocking
@@ -76,8 +91,10 @@ class IngestFileWorker(SandcrawlerWorker):
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
+ "://www.archive.org/",
"://web.archive.org/web/",
"://openlibrary.org/",
+ "://www.openlibrary.org/",
"://fatcat.wiki/",
# Domain squats
@@ -135,7 +152,7 @@ class IngestFileWorker(SandcrawlerWorker):
]
- def check_existing_ingest(self, base_url):
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
"""
Check in sandcrawler-db (postgres) to see if we have already ingested
this URL (ingest file result table).
@@ -147,14 +164,14 @@ class IngestFileWorker(SandcrawlerWorker):
"""
if not self.try_existing_ingest:
return None
- existing = self.pgrest_client.get_ingest_file_result(base_url)
+ existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
# TODO: filter on more flags?
if existing and existing['hit'] == True:
return existing
else:
return None
- def find_resource(self, url, best_mimetype=None, force_recrawl=False):
+ def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.
@@ -183,7 +200,7 @@ class IngestFileWorker(SandcrawlerWorker):
if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000':
old_failure = True
- if self.try_spn2 and (resource == None or (resource.status == 'no-capture') or soft404 or old_failure):
+ if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure):
via = "spn2"
force_simple_get = 0
for domain in self.spn2_simple_get_domains:
@@ -191,14 +208,14 @@ class IngestFileWorker(SandcrawlerWorker):
force_simple_get = 1
break
resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
- print("[FETCH {}\t] {}\t{}".format(
+ print("[FETCH {:>6}] {} {}".format(
via,
- resource.status,
- resource.terminal_url or url),
+ (resource and resource.status),
+ (resource and resource.terminal_url) or url),
file=sys.stderr)
return resource
- def process_existing(self, request, result_row):
+ def process_existing(self, request: dict, result_row: dict) -> dict:
"""
If we have an existing ingest file result, do any database fetches or
additional processing necessary to return a result.
@@ -226,16 +243,25 @@ class IngestFileWorker(SandcrawlerWorker):
}
return result
- def process_hit(self, resource, file_meta):
+ def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
"""
Run all the necessary processing for a new/fresh ingest hit.
"""
- return {
- 'grobid': self.process_grobid(resource, file_meta),
- 'pdf_meta': self.process_pdfextract(resource, file_meta),
- }
+ if ingest_type == "pdf":
+ return {
+ 'grobid': self.process_grobid(resource, file_meta),
+ 'pdf_meta': self.process_pdfextract(resource, file_meta),
+ }
+ elif ingest_type == "xml":
+ return {
+ 'xml_meta': self.process_xml(resource, file_meta),
+ }
+ elif ingest_type == "html":
+ return self.process_html(resource, file_meta)
+ else:
+ raise NotImplementedError(f"process {ingest_type} hit")
- def process_grobid(self, resource, file_meta):
+ def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
"""
Submits to resource body to GROBID for processing.
@@ -266,7 +292,7 @@ class IngestFileWorker(SandcrawlerWorker):
result.pop('key', None)
return result
- def process_pdfextract(self, resource, file_meta):
+ def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
"""
Extracts thumbnail and pdf_meta info from PDF.
@@ -288,13 +314,99 @@ class IngestFileWorker(SandcrawlerWorker):
if self.thumbnail_sink and result.page0_thumbnail is not None:
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
if self.pdftext_sink:
- self.pdftext_sink.push_record(result.to_pdftext_dict())
+ self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
result.page0_thumbnail = None
result.text = None
result.file_meta = None
return result.to_pdftext_dict()
- def timeout_response(self, task):
+ def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Simply publishes to Kafka topic.
+
+ In the future, could extract other metadata here (like body word
+ count), or attempt to fetch sub-resources.
+ """
+ if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
+ jats_xml = xml_reserialize(resource.body)
+ msg = dict(
+ sha1hex=file_meta["sha1hex"],
+ status="success",
+ jats_xml=jats_xml,
+ )
+ self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex'])
+ return dict(status="success")
+
+ def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
+ html_body = html_extract_body_teixml(resource.body)
+ html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
+
+ if html_scope not in ('article-fulltext', 'unknown'):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="html-body-wrong-scope",
+ html_biblio=html_biblio_dict,
+ html_scope=html_scope,
+ )
+
+ raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio_dict,
+ html_scope=html_scope,
+ )
+
+ when = parse_cdx_datetime(resource.cdx.datetime)
+
+ full_resources: List[WebResource] = []
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ html_scope=html_scope,
+ )
+
+ try:
+ if self.html_quick_mode:
+ full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result['status'] = 'petabox-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result['status'] = 'cdx-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result['status'] = 'wayback-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result['status'] = 'wayback-content-error'
+ partial_result['error_message'] = str(e)[:1600]
+ return partial_result
+
+ if self.htmlteixml_sink and html_body['status'] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])
+
+ html_body.pop("tei_xml", None)
+
+ return dict(
+ html_body=html_body,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+ )
+
+ def timeout_response(self, task: dict) -> dict:
print("[TIMEOUT]", file=sys.stderr)
return dict(
request=task,
@@ -303,22 +415,20 @@ class IngestFileWorker(SandcrawlerWorker):
error_message="ingest worker internal timeout",
)
- def want(self, request):
- if not request.get('ingest_type') in ('file', 'pdf'):
+ def want(self, request: dict) -> bool:
+ if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
return False
return True
- def process(self, request, key=None):
+ def process(self, request: dict, key: Any = None) -> dict:
- # backwards compatibility
- if request.get('ingest_type') in ('file', None):
+ # old backwards compatibility
+ if request.get('ingest_type') == 'file':
request['ingest_type'] = 'pdf'
- # for now, only pdf ingest is implemented
- if not 'ingest_type' in request:
- request['ingest_type'] = "pdf"
- assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
+ if ingest_type not in ("pdf", "xml", "html"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
# parse/clean URL
# note that we pass through the original/raw URL, and that is what gets
@@ -329,25 +439,27 @@ class IngestFileWorker(SandcrawlerWorker):
for block in self.base_url_blocklist:
if block in base_url:
- print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
return dict(request=request, hit=False, status="skip-url-blocklist")
- print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
best_mimetype = None
if ingest_type == "pdf":
best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
- existing = self.check_existing_ingest(base_url)
+ existing = self.check_existing_ingest(ingest_type, base_url)
if existing:
return self.process_existing(request, existing)
- result = dict(request=request, hit=False)
+ result: Dict[str, Any] = dict(request=request, hit=False)
next_url = base_url
hops = [base_url]
- self.max_hops = 6
-
while len(hops) <= self.max_hops:
@@ -400,25 +512,9 @@ class IngestFileWorker(SandcrawlerWorker):
result['error_message'] = str(e)[:1600]
return result
- if not resource.hit:
- result['status'] = resource.status
- if resource.terminal_url:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
- if resource.terminal_url not in result['hops']:
- result['hops'].append(resource.terminal_url)
- return result
-
- if not resource.body:
- result['status'] = 'null-body'
- return result
- file_meta = gen_file_metadata(resource.body)
+ assert resource
- if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
- result['status'] = 'blocked-cookie'
+ if resource.terminal_url:
result['terminal'] = {
"terminal_url": resource.terminal_url,
"terminal_dt": resource.terminal_dt,
@@ -426,53 +522,47 @@ class IngestFileWorker(SandcrawlerWorker):
}
if resource.terminal_url not in result['hops']:
result['hops'].append(resource.terminal_url)
+
+ if not resource.hit:
+ result['status'] = resource.status
return result
- # crude handling of content-encoding; wayback fetch library usually
- # (and should always?) handle this
- if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
- try:
- inner_body = gzip.decompress(resource.body)
- except Exception as e:
- result['status'] = 'bad-gzip-encoding'
- result['error_message'] = str(e)
- return result
- if not inner_body:
- result['status'] = 'null-body'
- return result
- resource = ResourceResult(
- body=inner_body,
- # copy all other fields
- start_url=resource.start_url,
- hit=resource.hit,
- status=resource.status,
- terminal_url=resource.terminal_url,
- terminal_dt=resource.terminal_dt,
- terminal_status_code=resource.terminal_status_code,
- cdx=resource.cdx,
- revisit_cdx=resource.revisit_cdx,
- )
- file_meta = gen_file_metadata(resource.body)
-
- if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype'] or "text/xml" in file_meta['mimetype']:
+ if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
+ result['status'] = 'blocked-cookie'
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result['status'] = 'bad-gzip-encoding'
+ result['error_message'] = str(e)
+ return result
+
+ if not resource.body or file_meta['size_bytes'] == 0:
+ result['status'] = 'null-body'
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta['mimetype']
+ or "xhtml" in file_meta['mimetype']
+ or "application/xml" in file_meta['mimetype']
+ or "text/xml" in file_meta['mimetype']
+ )
+ if ingest_type == "pdf" and html_ish_resource:
# Got landing page or similar. Some XHTML detected as "application/xml"
- if resource.terminal_dt:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
- result['html'] = fulltext_url
+ result['extract_next_hop'] = fulltext_url
+
if not fulltext_url:
result['status'] = 'no-pdf-link'
return result
next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
assert next_url
next_url = clean_url(next_url)
- print("[PARSE\t] {}\t{}".format(
+ print("[PARSE {:>6}] {} {}".format(
+ ingest_type,
fulltext_url.get('technique'),
next_url,
),
@@ -483,7 +573,44 @@ class IngestFileWorker(SandcrawlerWorker):
return result
hops.append(next_url)
continue
-
+ elif ingest_type == "xml" and html_ish_resource:
+ # parse with selectolax, extract XML fulltext URL
+ html_doc = HTMLParser(resource.body)
+ extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, XML_FULLTEXT_PATTERNS)
+ if extract_next_hop:
+ next_url = extract_next_hop[0]
+ technique = extract_next_hop[1]
+ print("[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr)
+ if next_url in hops:
+ result['status'] = 'link-loop'
+ result['error_message'] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif ingest_type == "html" and html_ish_resource:
+ # parse with selectolax, extract HTML fulltext URL
+ html_doc = HTMLParser(resource.body)
+ extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, HTML_FULLTEXT_PATTERNS)
+ if extract_next_hop:
+ next_url = extract_next_hop[0]
+ technique = extract_next_hop[1]
+ if next_url in hops:
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ print("[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr)
+ hops.append(next_url)
+ continue
+
# default is to NOT keep hopping
break
@@ -491,6 +618,11 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "max-hops-exceeded"
return result
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit == True
+ assert resource.terminal_status_code in (200, 226)
+
if resource.terminal_url:
result['terminal'] = {
"terminal_url": resource.terminal_url,
@@ -499,35 +631,50 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_sha1hex": file_meta['sha1hex'],
}
- # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
- assert resource.hit == True
- assert resource.terminal_status_code in (200, 226)
-
result['file_meta'] = file_meta
result['cdx'] = cdx_to_dict(resource.cdx)
if resource.revisit_cdx:
result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
- # other failure cases
- if not resource.body or file_meta['size_bytes'] == 0:
- result['status'] = 'null-body'
- return result
-
- if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
+ if ingest_type == "pdf":
+ if file_meta['mimetype'] != "application/pdf":
+ result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
+ result['status'] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta['mimetype'] not in ("text/html",):
+ result['status'] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
- info = self.process_hit(resource, file_meta)
+ info = self.process_hit(ingest_type, resource, file_meta)
result.update(info)
+ # check if processing turned up an error
+ if info.get('status') not in ('success', None):
+ result['status'] = info['status']
+ return result
+
result['status'] = "success"
result['hit'] = True
- print("[SUCCESS\t] sha1:{} grobid:{} pdfextract:{}".format(
- result.get('file_meta', {}).get('sha1hex'),
- result.get('grobid', {}).get('status_code'),
- result.get('pdf_meta', {}).get('status'),
- ),
- file=sys.stderr)
+ if ingest_type == "pdf":
+ print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get('file_meta', {}).get('sha1hex'),
+ result.get('grobid', {}).get('status_code'),
+ result.get('pdf_meta', {}).get('status'),
+ ),
+ file=sys.stderr)
+ else:
+ print("[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get('file_meta', {}).get('sha1hex'),
+ ),
+ file=sys.stderr)
return result
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 8b02211..c7deea1 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -17,8 +17,8 @@ class SandcrawlerMinioClient(object):
Example config:
host="localhost:9000",
- access_key=os.environ['MINIO_ACCESS_KEY'],
- secret_key=os.environ['MINIO_SECRET_KEY'],
+ access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+ secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
"""
self.mc = minio.Minio(
host_url,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 1b8aa92..67e5c0b 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -3,20 +3,22 @@ import base64
import magic
import hashlib
import datetime
+from typing import Optional
+
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
-def clean_url(s):
+def clean_url(s: str) -> str:
s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
parsed.colon_before_port = b''
return str(urlcanon.whatwg(parsed))
-def gen_file_metadata(blob):
+def gen_file_metadata(blob: bytes) -> dict:
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
@@ -24,6 +26,10 @@ def gen_file_metadata(blob):
"""
assert blob
mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if mimetype in ("application/xml", "text/xml"):
+ # crude check for JATS XML, using only first 1 kB of file
+ if b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
hashlib.sha256(),
@@ -39,7 +45,7 @@ def gen_file_metadata(blob):
mimetype=mimetype,
)
-def b32_hex(s):
+def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -62,7 +68,7 @@ NORMAL_MIME = (
'application/octet-stream',
)
-def normalize_mime(raw):
+def normalize_mime(raw: str) -> Optional[str]:
raw = raw.lower().strip()
for norm in NORMAL_MIME:
if raw.startswith(norm):
@@ -103,7 +109,7 @@ def test_normalize_mime():
assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx, normalize=True):
+def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -138,32 +144,45 @@ def parse_cdx_line(raw_cdx, normalize=True):
mime = normalize_mime(mime)
sha1hex = b32_hex(sha1b32)
- http_status = int(http_status)
- c_size = int(c_size)
- offset = int(offset)
return dict(
surt=surt,
url=url,
datetime=dt,
mimetype=mime,
- http_status=http_status,
+ http_status=int(http_status),
sha1b32=sha1b32,
sha1hex=sha1hex,
- warc_csize=c_size,
- warc_offset=offset,
+ warc_csize=int(c_size),
+ warc_offset=int(offset),
warc_path=warc,
)
-def parse_cdx_datetime(dt_str):
+def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+ if not dt_str:
+ return None
try:
- return datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+ return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
except Exception:
return None
+def test_parse_cdx_datetime() -> None:
+ assert parse_cdx_datetime("") == None
+ assert parse_cdx_datetime("asdf") == None
+ assert parse_cdx_datetime("19930203123045") != None
+ assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+ return '%04d%02d%02d%02d%02d%02d' % (
+ dt.year, dt.month, dt.day,
+ dt.hour, dt.minute, dt.second,
+ )
+
+def test_datetime_to_cdx() -> None:
+ assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None):
+ status_forcelist=(500, 502, 504), session=None) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index fbc5273..f13b1f3 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,7 +20,7 @@ grobid
"""
import os
-from typing import Optional
+from typing import Optional, AnyStr
import xml.etree.ElementTree
from sandcrawler.workers import SandcrawlerWorker
@@ -28,6 +28,7 @@ from sandcrawler.db import SandcrawlerPostgresClient
from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.html_ingest import HtmlMetaRow
class PersistCdxWorker(SandcrawlerWorker):
@@ -95,8 +96,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if not k in raw:
self.counts['skip-request-fields'] += 1
return None
- if raw['ingest_type'] not in ('pdf', 'xml'):
- print(raw['ingest_type'])
+ if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
self.counts['skip-ingest-type'] += 1
return None
request = {
@@ -121,7 +121,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
return request
- def file_result_to_row(self, raw):
+ def file_result_to_row(self, raw: dict) -> Optional[dict]:
"""
Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema
@@ -137,7 +137,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
ingest_type = raw['request'].get('ingest_type')
if ingest_type == 'file':
ingest_type = 'pdf'
- if ingest_type not in ('pdf', 'xml'):
+ if ingest_type not in ('pdf', 'xml', 'html'):
self.counts['skip-ingest-type'] += 1
return None
if raw['status'] in ("existing", ):
@@ -159,6 +159,22 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
return result
+ def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
+ html_body = record.get('html_body')
+ file_meta = record.get('file_meta')
+ if not (file_meta and html_body):
+ return None
+ return HtmlMetaRow(
+ sha1hex=file_meta["sha1hex"],
+ status=record.get('status'),
+ scope=record.get('scope'),
+ has_teixml=bool(html_body and html_body['status'] == 'success' and html_body.get('tei_xml')),
+ has_thumbnail=False, # TODO
+ word_count=(html_body and html_body.get('word_count')) or None,
+ biblio=record.get('html_biblio'),
+ resources=record.get('html_resources'),
+ )
+
def push_batch(self, batch):
self.counts['total'] += len(batch)
@@ -197,6 +213,12 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
self.counts['insert-file_meta'] += resp[0]
self.counts['update-file_meta'] += resp[1]
+ html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')]
+ if html_meta_batch:
+ resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="nothing")
+ self.counts['insert-html_meta'] += resp[0]
+ self.counts['update-html_meta'] += resp[1]
+
self.db.commit()
return []
@@ -452,9 +474,11 @@ class PersistPdfTextWorker(SandcrawlerWorker):
class PersistThumbnailWorker(SandcrawlerWorker):
"""
- Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table.
+ Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL
+ table.
- This worker *must* be used with raw kakfa mode.
+ This worker *must* be used with raw kafka mode; thumbnails are *not*
+ wrapped in JSON like most sandcrawler kafka messages.
"""
def __init__(self, **kwargs):
@@ -487,3 +511,70 @@ class PersistThumbnailWorker(SandcrawlerWorker):
)
self.counts['s3-put'] += 1
+
+class GenericPersistDocWorker(SandcrawlerWorker):
+ """
+ Pushes blobs from Kafka to S3.
+
+ Objects are assumed to be JSON-wrapped strings.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get('s3_url', 'localhost:9000'),
+ access_key=kwargs['s3_access_key'],
+ secret_key=kwargs['s3_secret_key'],
+ default_bucket=kwargs['s3_bucket'],
+ )
+ self.s3_extension = kwargs.get('s3_extension', ".unknown")
+ self.s3_folder = kwargs.get('s3_folder', "unknown")
+ self.doc_key = "unknown"
+
+ def process(self, record: dict, key: Optional[AnyStr] = None) -> None:
+
+ if record.get('status') != 'success' or not record.get(self.doc_key):
+ return
+
+ assert key is not None
+ if isinstance(key, bytes):
+ key_str = key.decode('utf-8')
+ elif isinstance(key, str):
+ key_str = key
+ assert len(key_str) == 40
+ if 'sha1hex' in record:
+ assert key_str == record['sha1hex']
+
+ resp = self.s3.put_blob(
+ folder=self.s3_folder,
+ blob=record[self.doc_key].encode('utf-8'),
+ sha1hex=key_str,
+ extension=self.s3_extension,
+ )
+ self.counts['s3-put'] += 1
+
+
+class PersistXmlDocWorker(GenericPersistDocWorker):
+ """
+ Pushes JATS XML file to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get('s3_extension', ".jats.xml")
+ self.s3_folder = kwargs.get('s3_folder', "xml_doc")
+ self.doc_key = "jats_xml"
+
+
+class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
+ """
+ Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get('s3_extension', ".tei.xml")
+ self.s3_folder = kwargs.get('s3_folder', "html_body")
+ self.doc_key = "tei_xml"
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
new file mode 100644
index 0000000..7a0086d
--- /dev/null
+++ b/python/sandcrawler/xml.py
@@ -0,0 +1,7 @@
+
+import xml.etree.ElementTree as ET
+
+
+def xml_reserialize(raw: bytes) -> str:
+ root = ET.fromstring(raw)
+ return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 77c0704..6be8bac 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -3,7 +3,7 @@
"""
These are generally for continuously running workers that consume from Kafka.
Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
-or minio.
+or S3 (SeaweedFS).
"""
import os
@@ -13,6 +13,7 @@ import datetime
import raven
from sandcrawler import *
+from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
try:
@@ -148,6 +149,42 @@ def run_persist_thumbnail(args):
)
pusher.run()
+def run_persist_xml_doc(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.xml-doc"
+ worker = PersistXmlDocWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-xml-doc",
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
+def run_persist_html_teixml(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.html-teixml"
+ worker = PersistHtmlTeiXmlWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-html-teixml",
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
def run_persist_pdftrio(args):
consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
worker = PersistPdfTrioWorker(
@@ -174,6 +211,8 @@ def run_ingest_file(args):
grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic,
@@ -193,12 +232,22 @@ def run_ingest_file(args):
kafka_hosts=args.kafka_hosts,
produce_topic=thumbnail_topic,
)
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
worker = IngestFileWorker(
grobid_client=grobid_client,
sink=sink,
grobid_sink=grobid_sink,
thumbnail_sink=thumbnail_sink,
pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
# don't SPNv2 for --bulk backfill
try_spn2=not args.bulk,
)
@@ -242,16 +291,16 @@ def main():
help="postgresql database connection string",
default="postgres:///sandcrawler")
parser.add_argument('--s3-url',
- help="S3 (minio) backend URL",
+ help="S3 (seaweedfs) backend URL",
default="localhost:9000")
parser.add_argument('--s3-access-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_ACCESS_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
parser.add_argument('--s3-secret-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_SECRET_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
parser.add_argument('--s3-bucket',
- help="S3 (minio) bucket to persist into",
+ help="S3 (seaweedfs) bucket to persist into",
default="sandcrawler-dev")
subparsers = parser.add_subparsers()
@@ -264,7 +313,7 @@ def main():
sub_pdf_extract.set_defaults(func=run_pdf_extract)
sub_persist_grobid = subparsers.add_parser('persist-grobid',
- help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres")
+ help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres")
sub_persist_grobid.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
@@ -274,7 +323,7 @@ def main():
sub_persist_grobid.set_defaults(func=run_persist_grobid)
sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
- help="daemon that consumes pdftext output from Kafka and pushes to minio and postgres")
+ help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")
sub_persist_pdftext.add_argument('--s3-only',
action='store_true',
help="only upload TEI-XML to S3 (don't write to database)")
@@ -284,9 +333,17 @@ def main():
sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
- help="daemon that consumes thumbnail output from Kafka and pushes to minio and postgres")
+ help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")
sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
+ sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc',
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
+
+ sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml',
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
+
sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
@@ -304,7 +361,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.func(args)
diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html
new file mode 100644
index 0000000..dbe3ef7
--- /dev/null
+++ b/python/tests/files/dlib_05vanhyning.html
@@ -0,0 +1,350 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="http://schema.org/Article">
+<head>
+<script type="text/javascript" src="/js/ga.js"></script>
+<style type="text/css">
+
+.topLeft { border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftThick { border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftRight {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftRightThick {border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftBottom {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.all {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+table.plain {border-collapse: separate;
+ border-spacing: 0px;
+ margin-left: auto;
+ margin-right: auto;
+ }
+td.plain {padding: 6px;
+ vertical-align: text-top;
+ }
+
+table.author {border-collapse: separate;
+ border-spacing: 6px;
+ }
+td.authors {padding: 6px;
+ }
+
+li:not(:last-child) {
+ margin-bottom: .5em;
+ }
+
+div.center {margin-left: auto; margin-right: auto;
+ }
+
+</style>
+<meta charset="utf-8" />
+<meta id="DOI" content="10.1045/may2017-vanhyning" />
+<meta itemprop="datePublished" content="2017-05-15" />
+<meta id="description" content="D-Lib Magazine Article" />
+<meta id="keywords" content="Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS" />
+<link href="../../../style/style1.css" rel="stylesheet" type="text/css" />
+
+<title>Transforming Libraries and Archives through Crowdsourcing</title>
+</head>
+
+<body>
+<form action="/cgi-bin/search.cgi" method="get">
+
+<div style="height:2px;background:#2b538e"></div>
+<div style="height:4px;background:#4078b1"></div>
+
+<div style="height:30px;background:#4078b1">
+
+<span style="color: #ffffff; font-size: 12px; float: right; margin-right: 10px;">Search D-Lib:
+<input type="text" id="words" value="" size="25" />
+<input type="submit" id="search" value="Go!" />
+<input type="hidden" id="config" value="htdig" />
+<input type="hidden" id="restrict" value="" />
+<input type="hidden" id="exclude" value="" />
+</span>
+</div>
+
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:1px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:1px;background:#2b538e"></div>
+<div style="height:92px;background:#4078b1"><img width="450" height="90" alt="D-Lib-blocks5" src="../../../img2/D-Lib-blocks5.gif">
+</div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#e04c1e"></div>
+<div style="height:24px;background:#eda443"><img src="../../../img2/magazine5.gif" alt="The Magazine of Digital Library Research" width="830" height="24" /></div>
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:28px;background:#2b538e">
+<div id="navtable">
+<table>
+<tr><td class="navtext"><img src="../../../img2/transparent.gif" alt="" width="20" height="20" /><a href="../../../dlib.html">HOME</a>&nbsp;|&nbsp;<a href="../../../about.html">ABOUT D-LIB</a>&nbsp;|&nbsp;<a href="../../../contents.html" class="navtext">CURRENT ISSUE</a>&nbsp;|&nbsp;<a href="../../../back.html">ARCHIVE</a>&nbsp;|&nbsp;<a href="../../../author-index.html">INDEXES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/groups.html">CALENDAR</a>&nbsp;|&nbsp;<a href="../../author-guidelines.html">AUTHOR GUIDELINES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/mailman/listinfo/dlib-subscribers">SUBSCRIBE</a>&nbsp;|&nbsp;<a href="../../letters.html">CONTACT D-LIB</a></td></tr></table></div></div>
+<div style="height:4px;background:#2b538e"></div>
+<div style="height:1px;background:#e04c1e"></div>
+
+<div style="padding-left: 2.5em; padding-top: 1em;">
+
+<h3 class="blue-space">D-Lib Magazine</h3>
+<p class="blue">May/June 2017<br />
+Volume 23, Number 5/6<br />
+<a href="../05contents.html">Table of Contents</a>
+</p>
+
+<div class="divider-full">&nbsp;</div>
+
+<h3 class="blue-space">Transforming Libraries and Archives through Crowdsourcing</h3>
+
+<p class="blue">Victoria Van Hyning, University of Oxford, Zooniverse<br />
+victoria [at] zooniverse.org<br /><br />
+
+Samantha Blickhan, The Adler Planetarium, Zooniverse<br />
+samantha [at] zooniverse.org<br /><br />
+
+Laura Trouille, The Adler Planetarium, Zooniverse<br />
+trouille [at] zooniverse.org<br /><br />
+
+Chris Lintott, University of Oxford, Zooniverse<br />
+chris [at] zooniverse.org</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p><a href="https://doi.org/10.1045/may2017-vanhyning" class="nolinka">https://doi.org/10.1045/may2017-vanhyning</a></p>
+
+<div class="divider-full">&nbsp;</div>
+ <!-- Abstract or TOC goes here -->
+
+<h3 class="blue">Abstract</h3>
+
+<p class="blue">This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.</p>
+
+<p class="blue">Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS</p>
+
+<!-- Article goes next -->
+
+<div class="divider-full">&nbsp;</div>
+<h3>1 Overview<span style="vertical-align: super;"><a href="#n6">1</a></span></h3>
+
+<p>As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.</p>
+
+<p>The <a href="https://www.zooniverse.org/">Zooniverse</a> is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.<span style="vertical-align: super;"><a href="#n1">2</a></span> The Zooniverse also hosts the <a href="https://www.zooniverse.org/lab">Project Builder</a>, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see <a href="https://www.zooniverse.org/about/team">Zooniverse Team</a> for a more complete list).</p>
+
+<p>In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.1 What Can Crowdsourcing Offer GLAMs?</h4>
+
+<p>In 2010, author and professor Clay Shirky delivered a rousing <a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">TED</a> talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[<a href="#1">1</a>] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[<a href="#2">2</a>] </p>
+
+<p>Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.</p>
+
+<p>GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offers an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.<span style="vertical-align: super;"><a href="#n2">3</a></span></p>
+
+<p>While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.2 How Does The Zooniverse Work?</h4>
+
+<p>All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The amount of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like <a href="https://www.shakespearesworld.org/#!/">Shakespeare's World</a> and <a href="https://anno.tate.org.uk/#!/">AnnoTate</a>, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder.
+
+<p>In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is <a href="https://talk.sciencegossip.org/#/search?tags%5Bfemale%5D=true">#female</a> from the <a href="https://www.sciencegossip.org/">Science Gossip</a> project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the <a href="https://talk.sciencegossip.org/#/boards/BSC0000004/discussions/DSC00004s8">Science Gossip Talk</a> board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>2 IMLS Funded Effort: Approach and Focus</h3>
+
+<p>Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was <a href="https://www.ancientlives.org/">Ancient Lives</a>, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[<a href="#6">3</a>] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[<a href="#7">4</a>]</p>
+
+<p>By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see <a href="http://zooniverse.org/get-involved/call-for-projects">Call for Projects</a>).</p>
+
+<p>From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, <a href="https://www.operationwardiary.org/">Operation War Diary</a> which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[<a href="#8">5</a>] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. 
For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.</p>
+
+<p>Over the course of the IMLS project, we will also address the following research questions:</p>
+
+<p class="indentLeft">Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?</p>
+
+<p class="indentLeft">Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?</p>
+
+<p class="indentLeft">Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.1 Addressing Q1 (Crowdsourcing for GLAM)</h4>
+
+<p>Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including <a href="http://blogs.ucl.ac.uk/transcribe-bentham/">Transcribe Bentham</a>, <a href="http://fromthepage.com/">FromthePage</a>, and <a href="http://scripto.org/">Scripto</a>. </p>
+
+<p>In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include: </p>
+
+<ul>
+ <li><i>Volunteer choice:</i> volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings. </li>
+
+ <li><i>Keeping the task simple:</i> in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.<span style="vertical-align: super;"><a href="#n3">4</a></span> Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.</li>
+
+ <li><i>Narrowing the content focus to support sense-making:</i> In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.</li>
+</ul>
+
+<p>Features optimizing best practice in regard to data production and management will include:</p>
+
+<ul>
+ <li><i>Reliable, Scalable, Open Source Code Infrastructure:</i> The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects &#151; images, video or audio &#151; for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through <a href="https://github.com/zooniverse">Github</a>.</li>
+
+ <li><i>Data Ingestion into Zooniverse:</i> In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.</li>
+
+ <li><i>Useful Output for Curation:</i> The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[<a href="#9">6</a>] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.</li>
+</ul>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)</h4>
+
+<p>Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document. </p>
+
+<p>In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.</p>
+
+<p>There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.3 Addressing Research Q3 (Feedback/Training)</h4>
+
+<p>We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018&#151;2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>3 Future Steps: Community Engagement, Output &amp; How to Get Involved</h3>
+
+<p>The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.</p>
+
+<p>We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse <a href="https://github.com/zooniverse">code</a> is freely available under a liberal open source license which serves as an additional or parallel form of publication.</p>
+
+<p>GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on <a href="https://www.zooniverse.org/lab-how-to">how to build a project</a> and <a href="https://www.zooniverse.org/lab-best-practices/great-project">best practices for the design, launch and long term phases of a project</a>. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[<a href="#10">7</a>] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.<span style="vertical-align: super;"><a href="#n5">5</a></span> Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.</p>
+
+<p>In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions. </p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>Notes</h3>
+
+<table style="width:90%">
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n6">1</a></td>
+<td style="padding-top: .5em;">Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n1">2</a></td>
+<td style="padding-top: .5em;">For a partial list of publications, please visit <a href="https://www.zooniverse.org/about/publications">https://www.zooniverse.org/about/publications</a>. </td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n2">3</a></td>
+<td style="padding-top: .5em;">Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in <i>A New Companion to Digital Humanities</i>, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley &amp; Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also <i>Crowdsourcing Our Cultural Heritage</i>, ed. Mia Ridge (Ashgate, 2014).</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n3">4</a></td>
+<td style="padding-top: .5em;">Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "<a href="http://www.digitalhumanities.org/dhq/vol/6/2/000125/000125.html">Building a Volunteer Community: Results and Findings from Transcribe Bentham</a>", <i>Digital Humanities Quarterly</i> 6.2 (2012).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n5">5</a></td>
+<td style="padding-top: .5em;">Or, as Zephyr Frank, <i>et al</i>. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "<a href="https://github.com/cestastanford/crowdsourcing/raw/master/files/Mellon%20White%20Paper.pdf">Crowdsourcing for Humanities Research</a>" (2016) Project White Paper. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>References</h3>
+
+<table style="width:90%">
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="1">[1]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Clay Shirky, "<a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">How Cognitive Surplus Will Change the World</a>", June 2010.</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="2">[2]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Tim Berners-Lee with Mark Fischetti, <i>Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor</i> (San Francisco: Harper, 1999).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="6">[3]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in <i>The Oxyrhynchus Papyri</i>, R.-L. Chang <i>et al</i>., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98. </td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="7">[4]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Alex C. Williams <i>et al.</i>, "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in <i>IEEE International Conference on Big Data</i>, October 2014. <a href="https://doi.org/10.1109/BigData.2014.7004460">https://doi.org/10.1109/BigData.2014.7004460</a></td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="8">[5]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", <i>British Journal for Military History,</i> 2.2 (2016), 160-85.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="9">[6]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Katie Mika, "<a href="http://library.mcz.harvard.edu/blog/transcription-tools-survey-katie-mika-ndsr-resident">Transcription Tools: a survey by Katie Mika, NDSR Resident</a>", Harvard University, Ernst Mayr Library Blog.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="10">[7]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Roberta Kwok, "<a href="http://www.newyorker.com/tech/elements/crowdsourcing-for-shakespeare">Crowdsourcing For Shakespeare</a>", <i>The New Yorker</i>, 16 Jan. 2017. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>About the Authors</h3>
+
+<p class="blue"><b>Victoria Van Hyning</b> is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world leading academic crowdsourcing organization. Her projects include <a href="https://www.sciencegossip.org">Science Gossip</a>, <a href="http://www.shakespearesworld.org">Shakespeare's World</a> and <a href="https://anno.tate.org.uk">AnnoTate</a>.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Samantha Blickhan</b> is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Laura Trouille</b> is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers. </p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Chris Lintott</b> is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principal investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.</p>
+
+<div class="divider-full">&nbsp;</div>
+
+ <!-- Standard Copyright line here -->
+
+<div class="center">
+<p class="footer">Copyright &copy; 2017 Victoria Van Hyning, Samantha Blickhan, Laura Trouille and Chris Lintott</p>
+</div>
+
+<div style="height:1px;background:#2b538e"></div>
+
+</div>
+</form>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_fulltext.html b/python/tests/files/first_monday_ojs3_fulltext.html
new file mode 100644
index 0000000..2248aed
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_fulltext.html
@@ -0,0 +1,441 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Surveillance, stigma and sociotechnical design for HIV</title>
+</head>
+<body bgcolor="#ffffff" LINK="#bb7777" VLINK="#7777bb" ALINK="#ffee99" text="#000000">
+<blockquote><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71629" border="1" alt="First Monday" align="bottom"><br></blockquote>
+<hr>
+<blockquote>
+
+<center><a href="#author"><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71975" alt="Surveillance, stigma and sociotechnical design for HIV by Calvin Liang, Jevan Alexander Hutson, and Os Keyes" border="1"></a></center>
+
+<br><hr><br>
+
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71627" alt="Abstract"><br>Online dating and hookup platforms have fundamentally changed people&rsquo;s day-to-day practices of sex and love &mdash; but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms &ldquo;work&rdquo; for HIV frequently focus on user-to-user interactions and disclosure of one&rsquo;s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+
+<p><strong>Contents</strong></p>
+<p><a href="#p1">Introduction</a><br>
+<a href="#p2">Methods</a><br>
+<a href="#p3">Findings</a><br>
+<a href="#p4">Discussion</a><br>
+<a href="#p5">Conclusion</a></p>
+
+<p>&nbsp;</p><hr><p>&nbsp;</p>
+<p><strong><a name="p1"></a>Introduction</strong></p>
+
+<table width="70%" align="center"><tr><td>&ldquo;AIDS is essentially a crisis of governance, of what governments do and do not do, to and for their people &mdash; we have the drugs to treat HIV infection, we have the tools to confront the risks that drive HIV transmission and prevent infection itself &mdash; what we don&rsquo;t have is national political will necessary to scale-up our response. We have demanded too little from our leaders, excused far too much.&rdquo;<br>&mdash; Gregg Gonsalves, speech at the 2006 Toronto AIDS Conference.</td></tr></table>
+
+<table width="70%" align="center"><tr><td>&ldquo;Design is inherently about change &mdash; not just in the creation of new material artifacts, but in the ways that new technological objects afford new practices, social habits, and ways of living and interacting.&rdquo;<br>&mdash; Dombrowski, <em>et al.</em> (2016). &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments.&rdquo;</td></tr></table>
+
+<p>Living and loving with HIV is a complicated task. HIV status and the stigma attached to it exists within a complex interplay of social norms and medicolegal infrastructures. The medicolegal history of HIV begins the moment that HIV and AIDS emerged, constituting a mix of medically justified legal norms and legally enforced medical requirements. The criminal justice and public health systems of modern states demarcated people living with HIV as a uniquely dangerous population, &ldquo;one that needed to be sought out, tracked down, tested, reported, listed, tagged, monitored, regulated, and, increasingly, criminalized&rdquo; <a name="1a"></a>[<a href="#1">1</a>].</p>
+
+<p>The immediate policy response in the United States imposed significant criminal and civil liability upon people living with HIV (Hoppe, 2018; Harsono, <em>et al.</em>, 2017; Sykes, <em>et al.</em>, 2016; Thrasher, 2015; Galletly, <em>et al.</em>, 2014; Lehman, <em>et al.</em>, 2014; Gagnon, 2012; Pollard, 2006; Gostin, <em>et al.</em>, 1999). Between 1986&ndash;2019, HIV-specific criminal laws and sentence enhancements applicable to people living with HIV have been enacted in 34 states and two U.S. territories (Center for HIV Law &amp; Policy, 2019; Lehman, <em>et al.</em>, 2014). Since 1986, these laws have criminalized nondisclosure of HIV and engagement in &ldquo;risky&rdquo; behaviors such as sexual activity, exposure to bodily fluids, needle sharing, sex work, blood/organ/semen donation, and, in a variety of instances, behaviors posing little, if any, risk of HIV transmission (Center for Disease Control and Prevention, 2019a; Center for HIV Law &amp; Policy, 2019).</p>
+
+<p>Despite claiming medical legitimacy for this punitive approach, researchers have long understood that the criminalization of HIV transmission was instead fueled by the associations between HIV and the gay community and communities of color (Hoppe, 2018; Gallo, 2006; Johnson, 1992; Banks, 1989) at a time when consensual sex between same-sex partners was a criminal offense in twenty-two states and over 61 percent of American evangelicals and 50 percent of non-evangelicals agreed with the statement &ldquo;I sometimes think AIDS is a punishment for the decline in moral standards&rdquo; (Gallup and Castelli, 1987).</p>
+
+<p>A significant body of empirical social science work documents the harmful effects HIV laws have had on the lives of people living with HIV (Barr&eacute;-Sinoussi, <em>et al.</em>, 2018; Harsono, <em>et al.</em>, 2017; Sweeney, <em>et al.</em>, 2017; Adam, <em>et al.</em>, 2014). HIV criminalization both reinforces and magnifies HIV-related stigma and discrimination, reduces the willingness of persons at risk for HIV to get tested or seek care, and imperils demographic health collection of information (Harsono, <em>et al.</em>, 2017; Burris and Cameron, 2008; Galletly and Pinkerton, 2006; Elliot, 2002). A survey of over 2,000 people living with HIV in the U.S. revealed that at least 25 percent of respondents knew one or more individuals who were afraid to get tested for fear of facing criminalization (Sero Project, 2012). HIV criminalization also ignores the reality that successful antiretroviral therapy can render the level of the virus to undetectable, which, according to the National Institute of Health, means that HIV is then untransmittable (Eisinger, <em>et al.</em>, 2019).</p>
+
+<p>While HIV transmission was criminalized, other tools of control &mdash; in the form of surveillance &mdash; arose and were enforced. Early policy responses to HIV centered on overt surveillance and ostracism of those infected and perceived to be at risk (Fortin, 1995). This surveillance generally consists of disease reporting, sexual contact tracing, and data collection of people who have been diagnosed with HIV (Fan, 2012; 2011; Ward and Bell, 2014; Ward, 2005). The Center for Disease Control, for example, collects HIV data based on confidential name-based reporting laws implemented in all 50 states as of April 2008 (Center for Disease Control and Prevention, 2019b).</p>
+
+<p>HIV surveillance (and sexually transmitted infection surveillance more broadly) centralizes information and power in the state (Fairchild, <em>et al.</em>, 2007; Fan, 2012); because HIV intervention and surveillance is generally concentrated in lower income communities and health settings (McCree and Hogben, 2010), the most socially and economically marginalized communities bear the heaviest burden of HIV surveillance and its downstream consequences (Miller, <em>et al.</em>, 2004; Banks, 1989; Brandt, 1987). There is a long-racialized history of HIV, one that, in combination with the background racism of the United States, has led to the systemic undertreatment and under-consideration of communities of color (Ford, <em>et al.</em>, 2007; Anonymous, 2000; Johnson, 1992).</p>
+
+<p>This infrastructure of surveillance in turn reinforces the stigma of HIV, which has dramatic consequences for the likelihood of unwanted disclosure, access to care, psychiatric well-being, housing and employment discrimination, and, consequently, quality (or probability) of life (Lazarus, <em>et al.</em>, 2016; Mahajan, <em>et al.</em>, 2008). Coupled with the overarching stigma of HIV and its criminalization in various contexts, HIV surveillance offers a tool through which the state can identify citizens to be punished.</p>
+
+<p>In the era of &ldquo;big data&rdquo; and ubiquitous surveillance capitalism (Zuboff, 2019) &mdash; the private monetization of information about reality &mdash; HIV surveillance is not just in the hands of the state, but also in the hands of private organizations and individuals. In the context of widespread state surveillance and control and ongoing stigmatization of HIV, this opens yet more possibilities for harm through enabling the selling and redistribution of HIV status information, without the user&rsquo;s meaningful consent, to parties who may themselves engage in discrimination or direct violence.</p>
+
+<p>Many online platforms &mdash; including, as we trace out below, dating platforms &mdash; constitute not just spaces for the purposes outlined in their marketing materials but also tools for the police in tracing HIV status and criminalized behavior. In recent years, police have used technology to conduct Internet-based investigations for a similar purpose (POZ, 2015). Police now go undercover on Web sites and dating apps by creating fake identities online (Semitsu, 2011), and local law enforcement agencies and federal agencies increasingly employ these tactics in online investigations (Lichtblau and Arkin, 2014).</p>
+
+<p>Legal and public health scholars and advocates continue to call for a paradigm shift in managing HIV that leaves behind historical responses like surveillance, ostracism, and incarceration and accounts for the rise of the Internet and mobile technology and their impact on sexual attitudes and behaviors (Lehman, <em>et al.</em>, 2014; McCallum, 2014; Fan, 2011; Fenton, 2010). Since the criminalization of HIV, intimate platforms have become vital structures through which millions of people access the opportunity to engage in reciprocal romantic and sexual relationships (Hutson, <em>et al.</em>, 2018; Taylor, <em>et al.</em>, 2017; Rosenfeld and Thomas, 2012). By designing infrastructures for intimate affiliation, intimate platforms wield unmatched structural power to shape who meets whom and how within dating and sexual platforms (Hutson, <em>et al.</em>, 2018; Levy and Barocas, 2018; Emens, 2008; Robinson, 2007). These platforms frame the circumstances within which users understand each other as prospective romantic or sexual partners and shape social norms, sexual scripts, and relative advantages among users (Hardy and Lindtner, 2017; Kannabiran, <em>et al.</em>, 2012).</p>
+
+<p>The design of intimate platforms provides opportunities to explore new ways of managing HIV that reduce the concentration of power and information in the state (Fan, 2012). Through the role that platform design plays in shaping cultural norms, which has been identified as a more effective way of achieving HIV transmission prevention than flexing the punitive and surveillant arms of the state (Sunstein, 1996), intimate platform design provides opportunities to explore new ways of managing HIV (Fan, 2012). Indeed, a meta-analysis of HIV prevention efforts found that strategies that intervene in social meaning by shaping social norms, cultural practices, and individual attitudes were more effective in empowering behavioral change than appeals to fear (Albarracin, <em>et al.</em>, 2015).</p>
+
+<p>However, designing intimate platforms to account for HIV also presents serious challenges for social computing researchers and human-computer interaction (HCI) designers. As Handel and Shklovski pointed out: &ldquo;The minutiae of design decisions around profile options deserves particular attention because even the smallest changes can result in substantial differences for user interactions&rdquo; (Handel and Shklovski, 2012). In addition to concerns around how to best design for HIV, platforms, Grindr in particular, have already come under fire for sharing user HIV information with third parties (Singer, 2018). Moreover, designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the serious risk of re-entrenching the status quo and its incumbent inequalities and power relations (Bardzell, 2010). While designing for HIV presents opportunities to redress stigma and harm, researchers in HCI must understand that &ldquo;[i]t is not enough to have good intentions ... [we] must ground [our] efforts in clear political commitments and rigorous evaluations of the likely consequences&rdquo; (Green, 2018).</p>
+
+<p>From this comes the recognition that social computing designers and researchers seeking to design for disclosure cannot afford to ignore the ways that the lived experiences of people living with HIV are shaped by structural forces and, particularly, the reality of HIV criminalization and the State&rsquo;s role in conducting STD surveillance. Platforms, after all, do not exist in a separate sphere from material reality: a redesign that eases HIV disclosure from user-to-user might also involve the storing of disclosure data by the platform &mdash; data that can then be accessed, requisitioned, and co-opted by arms of the state. In line with Jackson, <em>et al.&rsquo;s</em> call for the social computing community to address the structural and lived consequences of law and policy that &ldquo;establish the very terrain on which design and practice can be conceived, articulated, and imagined &mdash; and upon which battles of accountability are inevitably waged&rdquo; <a name="2a"></a>[<a href="#2">2</a>], we wish to undertake a critical investigation of HIV disclosure in dating and hookup platforms. This involves not just investigating the implications of disclosure in a person-to-person sense, but also how platform design is shaped by legal and administrative regulation and how the risks of disclosure might open users up to systems of surveillance, stigma, and criminalization. We do so by using a range of platforms in an effort to gain a wide view, and to practice prefigurative politics &mdash; minimizing our assumptions about the &ldquo;type&rdquo; of people at risk of HIV infection and/or surveillance.</p>
+
+<p>To do this, we analyze platforms&rsquo; consequences for HIV through the lens of user-to-user interactions, exploring the ways that design renders users visible and vulnerable to wider carceral and surveillance infrastructures, and the way that design shapes (and is shaped by) HIV&rsquo;s legal status. We ground our discussion in a content analysis of 50 popular, mobile dating and hookup platforms, coding for design and policy choices related to HIV disclosure, prevention, destigmatization, surveillance, privacy, and criminalization. Through this, we reveal that many platforms fail to account for HIV, and of those that do, many neglect to attend to the downstream consequences of HIV disclosure and the data produced by it, while exacerbating the social, racial, and class stereotypes associated with the condition.</p>
+
+<p>As scholars and designers consider how platform design might aid HIV prevention and destigmatization (Hutson, <em>et al.</em>, 2018; Albury, <em>et al.</em>, 2017; Wohlfeiler, <em>et al.</em>, 2013; Rosser, <em>et al.</em>, 2011), we aim to grapple with the structural and ethical implications of designing for HIV, particularly how intimate platform design might aid and abet the criminalization and surveillance of HIV (Sykes, <em>et al.</em>, 2016; Kazatchkine, <em>et al.</em>, 2015; Perone, 2013; Gagnon, 2012; J&uuml;rgens, <em>et al.</em>, 2009). Drawing on principles from social justice-oriented design to investigate controversies and design possibilities in intimate platforms, we attempt to articulate an approach to intimate platform design that not only works to reduce the stigma of user disclosure, but also works to contest historic and present power imbalances and injustices between users, platforms, and the state.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p2"></a>Methods</strong></p>
+
+<p>Using a directed content analysis (Hsieh and Shannon, 2005), we reviewed 50 existing mobile dating and hookup platforms. Content analyses have proven effective in understanding platform design and governance and the ways design practices mediate user-to-user bias and discrimination (Levy and Barocas, 2018; Hutson, <em>et al.</em>, 2018). We set out to capture a landscape of popular platforms and selected the first 50 dating and hook up platforms in the top 200 grossing social networking applications in the United States on the iOS App Store in March of 2019. <a href="#fig1">Figure 1</a> lists the platforms selected in alphabetical order.</p>
+
+<p>&nbsp;</p>
+<a name="fig1"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71623" alt="50 dating and hookup platforms surveyed"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 1:</strong> The 50 dating and hookup platforms surveyed.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>Utilizing the walkthrough method (Light, <em>et al.</em>, 2018), we explored each platform&rsquo;s HIV-related user experience. We examined design features on each of these platforms, systematically documenting design choices, policies, and informational interventions that mediate HIV. Building upon previous work around intimate platforms and HIV, we coded each of the 50 intimate platforms based on the following dimensions:</p>
+
+<table width="70%" align="center"><tr><td><p>Prevention</p>
+<ul><li>Whether the app allows same-sex connections</li>
+<li>Whether a user can disclose HIV/sexually transmitted infection (STI) status (Warner, <em>et al.</em>, 2018)</li>
+<li>If they can disclose, what are the options? (Warner, <em>et al.</em>, 2018)</li>
+<li>Whether a user can search for or filter out users with HIV/STIs? (Hutson, <em>et al.</em>, 2018)</li>
+<li>Whether the platforms provide informational interventions with respect to HIV/STI prevention (Wang, <em>et al.</em>, 2019)</li></ul>
+<p>Stigma reduction</p>
+<ul><li>Whether a user can identify as having HIV/STI (<em>e.g.</em>, &ldquo;Poz&rdquo;, etc.)</li>
+<li>Whether a user can indicate interest in or acceptance of people living with HIV/STIs (<em>e.g.</em> outward presentation, separate from filtering, not simply via profile text) (Hutson, <em>et al.</em>, 2018)</li></ul>
+<p>Policies</p>
+<ul><li>Whether the platform engages HIV/STIs in their policies (terms of service, privacy, and community policies, etc.) (Jackson, <em>et al.</em>, 2014)</li></ul></td></tr></table>
+
+<p>For ethical reasons, we did not interact with other users, only observed features, and deleted our accounts once data were collected when possible (not all platforms allowed for account deletion). The design and policy choices described and discussed below are not intended as an endorsement of any particular design intervention for managing HIV. Rather, we aim to capture the various ways intimate platforms currently manage and mediate HIV among users and how those choices map onto extant legal and surveillant infrastructures. Additionally, we highlight two limitations in how we chose which platforms to analyze. First, it is possible for a hook-up platform to not have an accompanying mobile app, meaning our selection of platforms from the iOS app store will have invariably missed Web site-based platforms. Second, we may have overlooked platforms that are more niche or community-specific, yet not as popular in the broader platform marketplace (<em>i.e.</em>, not within the top grossing platforms).</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p3"></a>Findings</strong></p>
+
+<p>&nbsp;</p>
+<a name="fig2"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71624" alt="A visualization of our content analysis"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 2:</strong> A visualization of our content analysis.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Design features</strong></em></p>
+
+<p>Out of the 50 intimate platforms we examined, 13 were meant specifically for queer communities (11 specifically targeted at gay and bisexual men and two at lesbian and bisexual women). None of the platforms we reviewed were distinctly designed for trans people. The remaining 34 platforms were for general audiences, catering to heterosexual and homosexual connections, and three platforms were exclusively for heterosexual connections (eHarmony, Uniform Dating, and Waplog). Only queer-specific platforms (six) had explicit HIV disclosure options and allowed for filtering or searching based on HIV status. <a href="#fig3">Figure 3</a> shows the disclosure options for each platform. Growlr, Taimi, and Scruff allowed users to indicate that they were accepting of people living with HIV. Grindr, Hornet, Mr. X, Xtremboy, and Scruff, five platforms all of which are queer-specific, provide informational interventions with respect to HIV/STI prevention (See <a href="#fig4">Figure 4</a> for examples). Eight dating apps mentioned HIV in their policies (five queer-specific, three general). Four dating apps allowed users to identify with an HIV/STI-relevant identity category, often labeled &ldquo;poz&rdquo;. Please see <a href="#fig2">Figure 2</a> for a visualization of our content analysis.</p>
+
+<p>&nbsp;</p>
+<a name="fig3"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71625" alt="Disclosure options"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 3:</strong> Disclosure options.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>&nbsp;</p>
+<a name="fig4"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71626" alt="Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right)"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 4:</strong> Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right).</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Policies</strong></em></p>
+
+<p>None of the 50 intimate platforms we reviewed explicitly mention HIV in their terms of service. Four platforms expressly discuss HIV in their privacy policies (Grindr, Hornet, Scruff, and Mr. X), and four platforms mention HIV in platform safety policies (Planet Romeo, Tinder, BlackPeopleMeet, and Our Time). No platform engaged any of the legal implications of HIV. No platform engaged the public health surveillance of HIV.</p>
+
+<p>Of the four platforms that expressly engage HIV in their privacy policies (Grindr, Hornet, Mr. X, Scruff), only two (Grindr &amp; Hornet) explicitly prohibit sharing HIV information with third parties. By disclosing one&rsquo;s HIV status on Mr. X and Scruff, users consent to the platform&rsquo;s processing of that information. Grindr warns that HIV status disclosure on a user profile is effectively public information, however the platform does not share HIV status information with third party tracking, analytics, and advertising companies or service providers. Of all the platforms reviewed, Grindr&rsquo;s privacy policy is the only one that devotes an entire section to HIV status, which is not particularly surprising given Grindr&rsquo;s involvement in multiple controversies around sharing HIV information with third parties (Fitzsimons, 2019; Singer, 2018):</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;HIV Status. At the recommendation of HIV prevention experts and the community of Grindr users, we give you the option of publishing your health characteristics, such as your HIV status, in your Grindr community profile. Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App. As a result, you should carefully consider whether you want to disclose your HIV status. We do not share HIV status with any third-party service advertisers or third-party service providers other than companies that host data on our behalf (<em>e.g.</em>, Amazon Cloud). In addition, we do not use HIV status for advertising purposes and do not share this information with advertisers.&rdquo;</td></tr></table>
+
+<p> According to Hornet&rsquo;s privacy policies, they &ldquo;[do] not share any HIV status information with third parties unless required to do so by law&rdquo;. Of the 50 platforms reviewed, Hornet was the only one to enable users to opt into receiving &ldquo;in-app reminders to undergo HIV tests and receive information on the location of nearby testing centers.&rdquo; On Hornet, a user&rsquo;s HIV status &ldquo;is only searchable by users who have defined themselves as HIV positive.&rdquo; Scruff&rsquo;s privacy policy highlights that &ldquo;there is no requirement to&rdquo; provide them with &ldquo;health details and whether part of the POZ (HIV positive) community (for example, in creating or updating your profile),&rdquo; and that by doing so, users &ldquo;are explicitly consenting to [Scruff&rsquo;s] processing of [their] information.&rdquo; Mr. X&rsquo;s privacy policy notes that HIV status information &ldquo;may be considered &lsquo;special&rsquo; or &lsquo;sensitive&rsquo; in certain jurisdictions,&rdquo; and that by providing this information, users &ldquo;consent to [Mr. X&rsquo;s] processing that information&rdquo;.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p4"></a>Discussion</strong></p>
+
+<p><em><strong>Prevention</strong></em></p>
+
+<p>Platforms can act as an interventional tool to improve access to and perceptions of care for people living with HIV. Examples of HIV/STI prevention include a &ldquo;Last Tested Date&rdquo; section on a user&rsquo;s profile and reminders to get tested for HIV/STIs. Some current platforms engage with HIV more critically by acknowledging through specific features that HIV is an issue their users should be aware of. Hornet, for instance, provides its users with HIV-relevant educational material and resources for getting tested. Hornet also limits searching based on HIV status to people who themselves have chosen the HIV positive option, thereby limiting the possibility of HIV status-based discrimination. Hornet and Grindr can also provide reminders for users to get tested. Scruff allows users to choose from sex safety practices that include using condoms, using pre-exposure prophylaxis (PrEP), and/or treatment as prevention (Warner, <em>et al.</em>, 2019).</p>
+
+<p>Due in large part to the history of HIV&rsquo;s recognition as a medical condition, HIV has been generally classified as a &ldquo;gay man&rsquo;s problem&rdquo; in North America &mdash; frequently (albeit almost as frequently unmarked) a white, cisgender gay man&rsquo;s problem. This classification and framing acted to both separate normative society from the stigma associated with the condition and provide an avenue for activism by associating it with the most &ldquo;acceptable&rdquo; queer bodies: masculine, middle class, cisgender and white (Epstein, 1996).</p>
+
+<p>HIV has disproportionately impacted gay communities specifically, but transmission does not fit a neat pattern of being binarized tidily along sexuality. It is disproportionately prevalent in communities of color, appears in heterosexual relationships and lives, and risk of transmission follows societal vulnerability and marginalization &mdash; transgender women, particularly transgender women of color, are particularly overrepresented in diagnosis rates (Clark, <em>et al.</em>, 2017). While the partial normalization of HIV &mdash; holding it outside the direct concerns of white, cisgender, heterosexual people, but embodying it in people who look &ldquo;just like them&rdquo; &mdash; may have aided in assembling efforts to address the condition, the assumptions that it has created in who is at risk and who &ldquo;counts&rdquo; have been tremendous. One only has to look at the ethnographic work of Vivianne Namaste, who highlights how Montreal&rsquo;s history of HIV, its recognition, and efforts at its prevention simultaneously elided the incidence rate amongst the Haitian community (which at one point had 65 percent of reported AIDS cases) and lacked any advice or conception of susceptibility for women, particularly heterosexual or bisexual women (Namaste, 2015).</p>
+
+<p>Our platform analysis demonstrates that these same assumptions about vulnerability and risk are present in the design of intimate platforms. Generic platforms (<em>i.e.</em>, those that cater to non-queer or broader, more heteronormative audiences) entirely do not consider, engage, or design for HIV while the platforms for queer &mdash; and more specifically gay men &mdash; do. Even within the group of 13 queer-specific applications, neither of the two queer women-specific apps allowed for HIV disclosure, even though 23 percent of people with HIV in the U.S. are women (Center for Disease Control and Prevention, 2019c). Most, if not all, platforms dedicated to general audiences do nothing when it comes to HIV prevention, contributing to the knowledge gap for general audiences on sexual health, HIV-specific, and more. Because general audiences can go through online dating experiences without encountering HIV materials, platform designers allow these users to falsely believe that their sexual lives are excluded from important matters of sexual health.</p>
+
+<p>Our intent is not to suggest that HIV should be narrated as a problem for everyone; to ignore sexuality in the impact and risk of HIV transmission is an ahistorical mistake. But treating it <em>solely</em> as a &ldquo;gay man&rsquo;s problem&rdquo; simultaneously elides differences in vulnerability and risk within gay communities and perpetuates the silence around transmission for other populations, particularly trans women of color and/or heterosexual people. In other words, it is not that HIV is not frequently a risk for gay communities, but that drawing a line between sexuality and risk perpetuates the more nuanced disparities in risk and the discourse that HIV transmission is not something anyone else has to think about.</p>
+
+<p>Platforms can and have implemented prevention efforts through Last Tested Date and Testing Reminders features. Doing so more ubiquitously, rather than solely on gay male-specific platforms, may be helpful in normalizing prevention efforts like getting tested regularly and knowing one&rsquo;s status. Through opportunities like this, platform designers have the opportunity to promote HIV/STI prevention and care &mdash; an opportunity that is valuable precisely for its ability to normalize prevention efforts. This is not to say that such features are without risks, particularly with regard to state surveillance, intervention and structural forces, which is our next topic of concern and discussion.</p>
+
+<p><em><strong>Stigma &amp; disclosure</strong></em></p>
+
+<p>Designing for HIV is not as simple as including disclosure fields and status-based filtering or not. Allowing disclosure and filtering can protect people living with HIV from negative and sometimes harmful interactions, help filter out people who might discriminate against them, fight HIV stigma, and promote much-needed awareness. However, disclosure and filtering can also lead to discriminatory practices (Hutson, <em>et al.</em>, 2018), have potential for privacy unraveling (Warner, <em>et al.</em>, 2018), and contribute to surveillance (Fan, 2012, 2011).</p>
+
+<p>De-stigmatizing HIV offers designers an opportunity to engage in the structural dimensions of how HIV operates in social life and can possibly allow us to better tap into social norms around the condition that ultimately improve other outcomes. For instance, humanizing people living with HIV could lead to more people getting tested, being open about their status, and being communicative with their sexual partners. Platforms have the power to shift social norms and destigmatize HIV at scale due to their pervasiveness throughout modern connections, but designers need to contest the ethical implications of de-stigmatizing HIV on these platforms, especially through current features such as HIV-status-based filtering and disclosure options.</p>
+
+<p>Filtering and searching tools based on HIV status can be instrumental for people living with HIV to find others who are either seropositive or otherwise accepting of seropositive people. Additionally, filtering out those who might discriminate against them for their HIV status anyways allows people living with HIV to avoid awkward or even violent interactions with users who harbor problematic beliefs about people living with HIV. Conversely, HIV status-based filtering and searching tools have representational and allocational harms. First, it represents that there are particular psycho-social characteristics incumbent with HIV status. These stereotypes play out in a variety of different ways such as the framing that people living with HIV engage in &ldquo;risky&rdquo; sexual behavior. Second, HIV status-based filtering can be used to structurally exclude HIV positive users from the opportunity to engage in intimate affiliation (Hutson, <em>et al.</em>, 2018). Platforms can and do provide users the ability to screen out other users who identify as &ldquo;Poz&rdquo; or disclose their HIV status. Not only do these design features facilitate exclusion, they may disincentivize HIV related disclosures to the extent that such disclosures can be weaponized by other users to exclude them as potential intimate affiliates.</p>
+
+<p>Disclosure fields as a way to de-stigmatize HIV are similarly complicated in that they can inhibit and benefit people living with HIV. For one, encouraging users to disclose, regardless of their status, can create a healthier culture and discussion around HIV, possibly making talking about one&rsquo;s status an acceptable and common practice of intimate engagement. On the other hand, disclosure can be used for a variety of problematic ends that harm seropositive users. Other users may discriminate against users who have disclosed their HIV status, choosing to ignore or disengage with them entirely. Disclosure may have unintended consequences and lead to more personal and violent outcomes. Due to laws in particular jurisdictions, failure to disclose one&rsquo;s status to a partner can lead to prosecution and potentially incarceration. People living with HIV might also face physical and emotional threats for disclosing their status either publicly or privately.</p>
+
+<p>Due to these complexities, designers of dating platforms must face the question of how can we de-stigmatize HIV without creating additional obstacles for people living with HIV? Platforms need to critically unpack the possible consequences of well-intentioned design choices, including HIV status-based filtering and HIV status disclosure fields. Of the platforms we reviewed, Scruff is the only one to provide for HIV disclosure without using an express &ldquo;HIV status&rdquo; field, allowing instead two disclosure options, Poz and Treatment as Prevention. &ldquo;Poz&rdquo; constitutes an association and identification with a community (<em>e.g.</em>, &ldquo;I am a bear, daddy, poz&rdquo;), while &ldquo;Treatment as Prevention,&rdquo; signals antiretroviral therapy (<em>i.e.</em>, use of HIV medicines to treat HIV infection) and constitutes a link to sex safety practices.</p>
+
+<p><em><strong>Surveillance &amp; criminalization</strong></em></p>
+
+<p>At the same time, given the questions of structural power and surveillance built into these platforms, we are leery of treating disclosure option design as the site of de-stigmatization and justice. Questions of privacy and stigma go wider than micro-interactions and touch on how HIV is seen and responded to societally and administratively. The dominant responses to HIV/AIDS &ldquo;center on adjusting the traditional levers of criminal and tort law, and of public health law, with its surveillance and disciplinary regimes that concentrate information and decision-making in the state&rdquo; <a name="3a"></a>[<a href="#3">3</a>]. Indeed, HIV continues to function as a &ldquo;vector for the exercise of state power and the invention of novel logics and techniques of government,&rdquo; whereby &ldquo;[i]nfection with HIV virtually guarantees that a citizen will need to interact, either beneficently or coercively, with one or more state bureaucracies&rdquo; <a name="4a"></a>[<a href="#4">4</a>].</p>
+
+<p>The broader ecosystem of intimate platforms that we observed provided virtually no HIV-specific privacy information or protections for users living with HIV. Overall, both the platforms that account for HIV in their privacy policies and the platforms that enable disclosure but do not account for HIV in their privacy policies continue to place the risks and burden of surveillance, privacy, and disclosure on users with HIV. Grindr&rsquo;s &ldquo;HIV Status&rdquo; policy puts it clearly: &ldquo;Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App.&rdquo; By surfacing this as a risk we do not mean to suggest that users lack agency &mdash; merely that the agency to choose between a range of options can be influenced by how those options are bounded and made available in addition to the affordances and norms that platform design provides. That a user makes information public does not mean that &ldquo;consumable by all&rdquo; is the framework of disclosure that they have in mind (Wittkower, 2016).</p>
+
+<p>While some intimate platforms are working towards promoting HIV disclosure, prevention, and de-stigmatization, they are also failing to grapple with privacy implications of HIV and their responsibility in ensuring it. People living with HIV are already vulnerable and bear the weight of HIV disclosure&rsquo;s downstream consequences. By continuing to offload the burdens and risk on those with HIV, platforms are likely contributing to issues of nondisclosure as well as HIV testing. Research shows that privacy fears can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002).</p>
+
+<p>In this context, proposals to design for HIV disclosure that do not consider the wider structural implications of surveillance are concerning. The focus of most research into HIV and online dating in HCI on micro-interactions and enabling trust and certainty between users elides the implications that providing this data to a platform outside user control has and the way that this data can be used to control. This is not an abstract risk; just this year, Grindr (the platform under study) has been the subject of scrutiny by the U.S. government over its Chinese ownership, due to fears that the Chinese government might access and copy Grindr&rsquo;s data around HIV disclosure for the purpose of domestic policing and control (Fitzsimons, 2019). If we are designing to enable HIV disclosure, are we working to improve stigma associated with disclosure &mdash; or are we enabling new forms of control and surveillance?</p>
+
+<p>In the United States today, intimate platforms operate within 29 states that have HIV criminal laws, which include laws that target sex/nondisclosure of HIV-positive status, sex work, exposure to bodily fluids, needle-sharing, and blood/organ/semen donation, nine states that have sentencing enhancements applicable to people living with HIV who commit an underlying assault crime, and 24 states that have prosecuted people living with HIV under non-HIV-specific general criminal laws (Center for HIV Law &amp; Policy, 2019). Here, the design of intimate platforms cannot be removed from the reality of laws that criminalize HIV, particularly HIV non-disclosure.</p>
+
+<p>People living with HIV in U.S. states with HIV-specific criminal laws must disclose their HIV status to sexual partners. Generally, &ldquo;disclosure and consent&rdquo; is an affirmative defense <a name="5a"></a>[<a href="#5">5</a>], whereby a person can avoid criminal and civil liability if they disclose their serostatus <a name="6a"></a>[<a href="#6">6</a>] and their sexual partner voluntarily consents to sexual activity with knowledge of that serostatus <a name="7a"></a>[<a href="#7">7</a>]. Many of the laws that criminalize HIV non-disclosure do not provide guidance as to what methods of disclosure and consent are enough to avoid prosecution and conviction (McCallum, 2014). No court or legislature has affirmatively stated whether verbal disclosure and consent are necessary under criminal HIV transmission statutes. Furthermore, non-verbal communication online creates uncertainty as to whether there is sufficient disclosure and consent to remove criminal liability for HIV-positive individuals. Both disclosure and consent can be ambiguous or misunderstood, a problem that is complicated by the design and widespread use of mobile dating and hookup platforms.</p>
+
+<p>It remains unclear what constitutes appropriate disclosure and informed consent in the context of intimate platforms, such as HIV disclosure fields on user profiles or other communication in a profile&rsquo;s free form text sections (<em>e.g.</em>, &ldquo;+&rdquo; &ldquo;Poz&rdquo;, &ldquo;undetectable&rdquo;). Although some intimate platforms afford HIV-positive users the ability to disclose their serostatus in new ways, no court or legislature in the U.S. has answered whether disclosing HIV status on an intimate platform is enough to achieve informed consent and avoid criminal and civil liability. Yet many people living with HIV also use records of conversations on intimate platforms as a means of protection. For example, people disclose their status and use that record as a way to protect themselves from future allegations of non-disclosure. This ambiguity and incumbent legal risk places significant responsibility and pressure on HIV users. Research shows that fears around rejection, self-blame, criminalization, and privacy can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002). Privacy concerns around HIV disclosure are often associated with the need to protect one&rsquo;s self from HIV related stigma (Adam, <em>et al.</em>, 2011; Serovich and Mosack, 2006; Greene, <em>et al.</em>, 2003). As more and more people use platforms to meet intimate partners, the historical failure of HIV criminalization law to understand how disclosure and consent are negotiated in practice becomes all the more apparent.</p>
+
+<p>It might seem from this that designers and developers are trapped in an impossible situation &mdash; disclosure to protect users simultaneously produces the possibility of structural harms for those disclosing. While we urge designers to take both needs seriously, we do not consider it impossible; in fact, there is a range of work within queer theory and technology that not only articulates this tension of privacy, disclosure and the reuse of data but suggests queer forms of resistance to it. Writing more broadly, Brian Schram highlights the way that the increasing possibilities of &ldquo;big data&rdquo; and its attendant surveillance structures &ldquo;constitute an undoing of Queerness as a radical political injection&rdquo; <a name="8a"></a>[<a href="#8">8</a>], advocating a politics of <em>melancholia</em> that features a haunting of archives: an insertion of the dead weight of our collective memory as Queer persons into the growing catalog of our digital information. In other words, Schram suggests the deliberate incorporation of masses of false data, profiles, and traces into data stores in order to render ambiguous the truth of any presence and provide cover for those queer persons existing within the platform(s) data highlights. What would this look like in the case of dating platforms? What are the possibilities raised by incorporating a deluge of false accounts, <em>doppelg&auml;ngers</em>, and doubles, not as a deception of the platform or its users, but against state forces examining the database?</p>
+
+<p>More broadly, we might see possibilities for the future through practices in the past. In how queer communities responded to HIV disclosure and protection protocols during the 1980s and 1990s, David Halperin has articulated the way that gay communities worked to articulate norms that balanced risks, trust, and vulnerability in the absence of structural norms, that &ldquo;it is gay men themselves who have continued to define, and to redefine, the limits of safety through an ongoing history of sexual experimentation and mutual consultation, and who have thereby produced, over time, workable compromises and pragmatic solutions that balance safety and risk&rdquo; <a name="9a"></a>[<a href="#9">9</a>]. Rather than taking universalized, top-down approaches to platform design for all, we might instead seek to work up and to create a diverse range of spaces that challenge the ease of surveillance built into large-scale platforms and afford individual users more agency in establishing those compromises and solutions and engaging in that consultation.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p5"></a>Conclusion</strong></p>
+
+<p>As HCI researchers and designers, we continue to push the boundaries of what is technologically possible but doing so requires us to first ask whether platform design is even an appropriate intervention in a given situation (Keyes, <em>et al.</em>, 2019; Baumer and Silberman, 2011; Suchman, 2011). The current model of platform design for HIV cannot continue, as it is too closely tied to the collection and commodification of highly sensitive personal data. However, reimagining intimate platform design provides the social computing community an opportunity to intervene in the social norms around HIV and HIV disclosure in a manner that could unburden the weight of criminalization without centralizing the surveillant arms of the state.</p>
+
+<p>We envision a future of dating platforms that does not force people living with HIV to sacrifice surveillance for intimate experiences. Because of their entanglements with sex and romance, intimate platforms need to take on more responsibility in the sexual health and data privacy of their users. Drawing from our analysis and our own lived experiences, we recommend platform-level changes, changes in platform, and mechanisms to prevent platforms from knowing their users&rsquo; statuses. First, platforms should make explicit to their users the consequences of storing sensitive, personal information like HIV status and their documentation processes. Next, they should also implement policies that manage how data are stored when users delete their accounts and protect these data from third-party consumers. Finally, ownership of users&rsquo; data should belong to the users themselves, rather than the platforms. Users should be able to pass along their information to other users without the platforms tracking it.</p>
+
+<p>HIV is a medical condition, but its eradication requires not just technical, or even sociotechnical, but socio<em>political</em> solutions. Indeed, the way in which designers and policy-makers frame HIV is an inherently political decision, one that will impose the contours and boundaries of our response. The social computing community cannot do nothing, but it also must resist the desire to do everything. Designing user interfaces and platform policies to account for HIV will require a rigorous analysis of possible outcomes and consequences as well as a bedrock commitment to centering the voices and experiences of those impacted by HIV and the state&rsquo;s responses to it. Our commitments must account for the ways pathology and power intertwine to subjugate and otherize impacted communities at home and abroad.</p>
+
+<p>Designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the risk of re-entrenching the status quo and its incumbent inequalities and power relations (Dombrowski, <em>et al.</em>, 2016; Light, 2011; Irani, <em>et al.</em>, 2010; Bardzell, 2010). The social computing community must ground its efforts to design for HIV in clear political commitments to decriminalizing HIV and decentralizing power and information from the state. We must strive to unburden the weight of surveillance and incarceration on vulnerable and marginalized communities and work towards offloading the significant social and legal risks and pressures for people living with HIV. Moreover, our commitment to designing for HIV must not exclude nor obfuscate our capacity for direct action within and outside of the realms of design and research. This means fighting for the rights, dignity, and safety of people living with HIV in the streets and in the halls of local, national, and international political, legislative, and executive bodies.</p>
+
+<p>Our instinctual response to the failed and violent efforts of HIV criminalization and surveillance should not be &ldquo;there&rsquo;s an app for that,&rdquo; but rather &ldquo;there&rsquo;s a zap for that!&rdquo;. That is, the practice of designing for people with HIV should be a &ldquo;critical technical practice&rdquo; (Agre, 1997), undertaken with a mindset that sits uneasily between and is cognizant of both individual and structural power and consequence. Pioneered by the American gay liberation movement, a zap or &ldquo;zap action&rdquo; is a political action of direct and persistent public confrontation. Whether shouting down public figures or smashing pies into the faces of evangelicals, zaps aim to disrupt and disturb persons and institutions of authority to effect change (Cohen, 2018). In the words of AIDS Coalition to Unleash Power&rsquo;s (ACT UP) &ldquo;New Member Packet&rdquo;:</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;Zaps are a method for ACT UP members to register their disapproval of and anger toward the zap target. Zaps usually have more specific targets than actions. Because of this focus, numerous zapping techniques have been developed. ACT UP zaps individuals or organizations by: sending postcards or letters; invading offices and distributing fact sheets; sending (lots and lots of) faxes; picketing; outraged (and sometimes outrageous) phone calls. The more zappers who zap the zappee the better the zap.&rdquo;</td></tr></table>
+
+<p>A critical approach to designing for HIV requires the contesting of histories of incarceration, stigmatization, and surveillance and the ways in which the state exerts power and domination through its medicolegal levers of criminal law and public health surveillance. Intimate platform design should not only work to reduce the prevalence and stigma of HIV, but also to contest historic and present power imbalances and injustices between users, platforms, and the state. <img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71628" alt="End of article"></p>
+
+<p>&nbsp;</p>
+<a name="author"></a>
+<p><strong>About the authors</strong></p>
+
+<p><strong>Calvin Liang</strong> is a Ph.D. student in Human-Centered Design and Engineering Department at the University of Washington. His research broadly focuses on technology&rsquo;s role in and out of queerness, health, and queer health.<br>E-mail: cliang02 [at] uw [dot] edu</p>
+
+<p><strong>Jevan Alexander Hutson</strong>, living with HIV for four years, is a technology policy advocate, human-computer interaction researcher, and J.D. candidate at the University of Washington School of Law. His research interests center on issues of technology, law, and social life, with a particular focus on intimate/sexual computing.<br>E-mail: jevanh [at] uw [dot] edu</p>
+
+<p><strong>Os Keyes</strong> is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.<br>E-mail: okeyes [at] uw [dot] edu</p>
+
+<p>&nbsp;</p>
+<p><strong>Acknowledgements</strong></p>
+
+<p>We dedicate this paper to the radical history of the AIDS Coalition to Unleash Power (ACT UP) and to all of the souls we&rsquo;ve lost and continue to lose to HIV/AIDS. We would like to thank Mary Fan, Sean Munson, and Julie Kientz for valuable conversations and feedback, and Margret Wander and Margaret Hopkins for their ongoing care and support. This research was partially funded by a Microsoft Ada Lovelace Fellowship.</p>
+
+<p>&nbsp;</p>
+<p><strong>Notes</strong></p>
+
+<p><a name="1"></a><a href="#1a">1.</a> Halperin and Hoppe, 2017, p. 349.</p>
+
+<p><a name="2"></a><a href="#2a">2.</a> Jackson, <em>et al.</em>, 2014, p. 596.</p>
+
+<p><a name="3"></a><a href="#3a">3.</a> Fan, 2011, p. 36.</p>
+
+<p><a name="4"></a><a href="#4a">4.</a> Halperin and Hoppe, 2017, p. 255.</p>
+
+<p><a name="5"></a><a href="#5a">5.</a> See FLA. STAT. ANN. &sect; 775.0877 (2017) (&ldquo;[I]t is an affirmative defense to a charge of violating this section that the person exposed knew that the offender was infected with HIV, knew that the action being taken could result in transmission of the HIV infection, and consented to the action voluntarily with that knowledge.&rdquo;). See also <a href="http://www.hivlawandpolicy.org/states/florida">http://www.hivlawandpolicy.org/states/florida</a>.</p>
+
+<p><a name="6"></a><a href="#6a">6.</a> Serostatus is defined as: &ldquo;The state of either having or not having detectable antibodies against a specific antigen, as measured by a blood test (serologic test). For example, HIV seropositive means that a person has detectable antibodies to HIV; seronegative means that a person does not have detectable HIV antibodies.&rdquo; U.S. Department of Health &amp; Human Services, Education Materials, AIDSINFO, at <a href="https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus" target="_blank">https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus</a>, accessed 30 August 2019.</p>
+
+<p><a name="7"></a><a href="#7a">7.</a> Lehman, <em>et al.</em>, 2014, p. 1,101.</p>
+
+<p><a name="8"></a><a href="#8a">8.</a> Schram, 2019, p. 611.</p>
+
+<p><a name="9"></a><a href="#9a">9.</a> Halperin, 2015, p. 207.</p>
+
+<p>&nbsp;</p>
+<p><strong>References</strong></p>
+
+<p>Barry D. Adam, Richard Elliott, Patrice Corriveau, and Ken English, 2014. &ldquo;Impacts of criminalization on the everyday lives of people living with HIV in Canada,&rdquo; <em>Sexuality Research and Social Policy</em>, volume 11, number 1, pp. 39&ndash;49.<br>doi: <a href="https://doi.org/10.1007/s13178-013-0131-8" target="_blank">https://doi.org/10.1007/s13178-013-0131-8</a>, accessed 5 September 2020.</p>
+
+<p>Barry D. Adam, James Murray, Suzanne Ross, Jason Oliver, Stephen G. Lincoln, and Vicki Rynard, 2011. &ldquo;Hivstigma.com, an innovative Web-supported stigma reduction intervention for gay and bisexual men,&rdquo; <em>Health Education Research</em>, volume 26, number 5, pp. 795&ndash;807.<br>doi: <a href="https://doi.org/10.1093/her/cyq078" target="_blank">https://doi.org/10.1093/her/cyq078</a>, accessed 5 September 2020.</p>
+
+<p>Philip E. Agre, 1997. &ldquo;Toward a critical technical practice: Lessons learned in trying to reform AI,&rdquo; In: Geof Bowker, Les Gasser, Leigh Star, and Bill Turner (editors). <em>Bridging the great divide: Social science, technical systems, and cooperative work</em>. Mahwah, N.J.: Erlbaum.</p>
+
+<p>Anonymous, 2000. &ldquo;Name brands: The effects of intrusive HIV legislation on high-risk demographic groups,&rdquo; <em>Harvard Law Review</em>, volume 113, number 8, pp. 2,098&ndash;2,113.<br>doi: <a href="https://doi.org/10.2307/1342321" target="_blank">https://doi.org/10.2307/1342321</a>, accessed 5 September 2020.</p>
+
+<p>Taunya Lovell Banks, 1989. &ldquo;Women and AIDS &mdash; Racism, sexism, and classism,&rdquo; <em>New York University Review of Law &amp; Social Change</em>, volume 17, pp. 351&ndash;385, and at <a href="https://digitalcommons.law.umaryland.edu/fac_pubs/328" target="_blank">https://digitalcommons.law.umaryland.edu/fac_pubs/328</a>, accessed 5 September 2020.</p>
+
+<p>Shaowen Bardzell, 2010. &ldquo;Feminist HCI: Taking stock and outlining an agenda for design,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,301&ndash;1,310.<br>doi: <a href="https://doi.org/10.1145/1753326.1753521" target="_blank">https://doi.org/10.1145/1753326.1753521</a>, accessed 5 September 2020.</p>
+
+<p>Fran&ccedil;oise Barr&eacute;-Sinoussi, Salim S. Abdool Karim, Jan Albert, Linda-Gail Bekker, Chris Beyrer, Pedro Cahn, Alexandra Calmy, Beatriz Grinsztejn, Andrew Grulich, Adeeba Kamarulzaman, Nagalingeswaran Kumarasamy, Mona R. Loutfy, Kamal M. El Filali, Souleymane Mboup, Julio S.G. Montaner, Paula Munderi, Vadim Pokrovsky, Anne-Mieke Vandamme, Benjamin Young, and Peter Godfrey-Faussett, 2018. &ldquo;Expert consensus statement on the science of HIV in the context of criminal law,&rdquo; <em>Journal of the International AIDS Society</em>, volume 21, number 7.<br>doi: <a href="https://doi.org/10.1002/jia2.25161" target="_blank">https://doi.org/10.1002/jia2.25161</a>, accessed 5 September 2020.</p>
+
+<p>Eric P.S. Baumer and M. Six Silberman, 2011. &ldquo;When the implication is not to design (technology),&rdquo; <em>CHI &rsquo;11: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 2,271&ndash;2,274.<br>doi: <a href="https://doi.org/10.1145/1978942.1979275" target="_blank">https://doi.org/10.1145/1978942.1979275</a>, accessed 5 September 2020.</p>
+
+<p>Allan M Brandt, 1987. <em>No magic bullet: A social history of venereal disease in the United States since 1880</em>. Expanded edition. Oxford: Oxford University Press.</p>
+
+<p>Scott Burris and Edwin Cameron, 2008. &ldquo;The case against criminalization of HIV transmission,&rdquo; <em>Journal of the American Medical Association</em>, volume 300, number 5, pp. 578&ndash;581.<br>doi: <a href="https://doi.org/10.1001/jama.300.5.578" target="_blank">https://doi.org/10.1001/jama.300.5.578</a>, accessed 5 September 2020.</p>
+
+<p>Center for Disease Control and Prevention, 2019a. &ldquo;HIV and STD criminal laws,&rdquo; at <a href="https://www.cdc.gov/hiv/policies/law/states/exposure.html" target="_blank">https://www.cdc.gov/hiv/policies/law/states/exposure.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019b. &ldquo;HIV surveillance reports,&rdquo; at <a href="https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html" target="_blank">https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019c. &ldquo;HIV and women,&rdquo; at <a href="https://www.cdc.gov/hiv/group/gender/women/" target="_blank">https://www.cdc.gov/hiv/group/gender/women/</a>, accessed 5 September 2020.</p>
+
+<p>Center for HIV Law &amp; Policy, 2019. &ldquo;HIV criminalization in The United States,&rdquo; at <a href="http://www.hivlawandpolicy.org/sourcebook" target="_blank">http://www.hivlawandpolicy.org/sourcebook</a>, accessed 2 February 2020.</p>
+
+<p>Hollie Clark, Aruna Surendera Babu, Ellen Weiss Wiewel, Jenevieve Opoku, and Nicole Crepaz, 2017. &ldquo;Diagnosed HIV infection in transgender adults and adolescents: Results from the National HIV Surveillance System, 2009&ndash;2014,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 9, pp. 2,774&ndash;2,783.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1656-7" target="_blank">https://doi.org/10.1007/s10461-016-1656-7</a>, accessed 5 September 2020.</p>
+
+<p>Sascha Cohen, 2018. &ldquo;How gay activists challenged the politics of civility,&rdquo; <em>Smithsonian Magazine</em> (10 July), at <a href="https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/" target="_blank">https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2004. &ldquo;Reasons for HIV disclosure/nondisclosure in close relationships: Testing a model of HIV-disclosure decision making,&rdquo; <em>Journal of Social and Clinical Psychology</em>, volume 23, number 6, pp. 747&ndash;767.<br>doi: <a href="https://doi.org/10.1521/jscp.23.6.747.54804" target="_blank">https://doi.org/10.1521/jscp.23.6.747.54804</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2002. &ldquo;Perceived HIV-related stigma and HIV disclosure to relationship partners after finding out about the seropositive diagnosis,&rdquo; <em>Journal of Health Psychology</em>, volume 7, number 4, pp. 415&ndash;432.<br>doi: <a href="https://doi.org/10.1177/1359105302007004330" target="_blank">https://doi.org/10.1177/1359105302007004330</a>, accessed 5 September 2020.</p>
+
+<p>Lynn Dombrowski, Ellie Harmon, and Sarah Fox, 2016. &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments,&rdquo; <em>DIS &rsquo;16: Proceedings of the 2016 ACM Conference on Designing Interactive Systems</em>, pp. 656&ndash;671.<br>doi: <a href="https://doi.org/10.1145/2901790.2901861" target="_blank">https://doi.org/10.1145/2901790.2901861</a>, accessed 5 September 2020.</p>
+
+<p>Robert W. Eisinger, Carl W. Dieffenbach, and Anthony S. Fauci, 2019. &ldquo;HIV viral load and transmissibility of HIV infection: Undetectable equals untransmittable,&rdquo; <em>Journal of the American Medical Association</em>, volume 321, number 5, pp. 451&ndash;452.<br>doi: <a href="https://doi.org/10.1001/jama.2018.21167" target="_blank">https://doi.org/10.1001/jama.2018.21167</a>, accessed 5 September 2020.</p>
+
+<p>Richard Elliott, 2002. &ldquo;Criminal law, public health and HIV transmission: A policy options paper,&rdquo; <em>UNAIDS (Joint United Nations Programme on HIV/AIDS)</em>, at <a href="https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf" target="_blank">https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf</a>, accessed 5 September 2020.</p>
+
+<p>Elizabeth F. Emens, 2008. &ldquo;Intimate discrimination: The state&rsquo;s role in the accidents of sex and love,&rdquo; <em>Harvard Law Review</em>, volume 122, number 5, pp. 1,307&ndash;1,402.<br>doi: <a href="https://doi.org/10.2307/40379752" target="_blank">https://doi.org/10.2307/40379752</a>, accessed 5 September 2020.</p>
+
+<p>Steven Epstein, 1996. <em>Impure science: AIDS, activism, and the politics of knowledge</em>. Berkeley: University of California Press.</p>
+
+<p>Amy L. Fairchild, Ronald Bayer, and James Colgrove, with Daniel Wolfe, 2007. <em>Searching eyes: Privacy, the state, and disease surveillance in America</em>. Berkeley: University of California Press.</p>
+
+<p>Mary D. Fan, 2012. &ldquo;Decentralizing STD surveillance: Toward better informed sexual consent,&rdquo; <em>Yale Journal of Health Policy, Law, and Ethics</em>, volume 12, number 1, pp. 1&ndash;38.</p>
+
+<p>Mary D. Fan, 2011. &ldquo;Sex, privacy, and public health in a casual encounters culture,&rdquo; <em>University of California Davis Law Review</em>, volume 25, pp. 531&ndash;596.</p>
+
+<p>Tim Fitzsimons, 2019. &ldquo;Inside Grindr, fears that China wanted to access user data via HIV research,&rdquo; <em>NBC News</em> (2 April), at <a href="https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996" target="_blank">https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996</a>, accessed 5 September 2020.</p>
+
+<p>Chandra L. Ford, Kathryn D. Whetten, Susan A. Hall, Jay S. Kaufman, and Angela D. Thrasher, 2007. &ldquo;Black sexuality, social construction, and research targeting &lsquo;The Down Low&rsquo; (&lsquo;The DL&rsquo;),&rdquo; <em>Annals of Epidemiology</em>, volume 17, number 3, pp. 209&ndash;216.<br>doi: <a href="https://doi.org/10.1016/j.annepidem.2006.09.006" target="_blank">https://doi.org/10.1016/j.annepidem.2006.09.006</a>, accessed 5 September 2020.</p>
+
+<p>A.J. Fortin, 1995. &ldquo;AIDS, surveillance, and public policy,&rdquo; <em>Research in Law and Policy Studies</em>, volume 4, pp. 173&ndash;197.</p>
+
+<p>Marilou Gagnon, 2012. &ldquo;Toward a critical response to HIV criminalization: Remarks on advocacy and social justice,&rdquo; <em>Journal of the Association of Nurses in AIDS Care</em>, volume 23, number 1, pp. 11&ndash;15.<br>doi: <a href="https://doi.org/10.1016/j.jana.2011.08.012" target="_blank">https://doi.org/10.1016/j.jana.2011.08.012</a>, accessed 5 September 2020.</p>
+
+<p>Carol L. Galletly and Steven D. Pinkerton, 2006. &ldquo;Conflicting messages: How criminal HIV disclosure laws undermine public health efforts to control the spread of HIV,&rdquo; <em>AIDS and Behavior</em>, volume 10, number 5, pp. 451&ndash;461.<br>doi: <a href="https://doi.org/10.1007/s10461-006-9117-3" target="_blank">https://doi.org/10.1007/s10461-006-9117-3</a>, accessed 5 September 2020.</p>
+
+<p>C. Galletly, Z. Lazzarini, C. Sanders, and S.D. Pinkerton, 2014. &ldquo;Criminal HIV exposure laws: Moving forward,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 1,011&ndash;1,013.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0731-1" target="_blank">https://doi.org/10.1007/s10461-014-0731-1</a>, accessed 5 September 2020.</p>
+
+<p>Robert C. Gallo, 2006. &ldquo;A reflection on HIV/AIDS research after 25 years,&rdquo; <em>Retrovirology</em>, volume 3, article number 72.<br>doi: <a href="https://doi.org/10.1186/1742-4690-3-72" target="_blank">https://doi.org/10.1186/1742-4690-3-72</a>, accessed 5 September 2020.</p>
+
+<p>George Gallup, Jr. and Jim Castelli, 1987. &ldquo;Poll catalogs views on AIDS by religion,&rdquo; <em>Dallas Morning News</em> (27 September), p. 45A.</p>
+
+<p>Lawrence O. Gostin, Scott Burris, and Zita Lazzarini, 1999. &ldquo;The law and the public&rsquo;s health: A study of infectious disease law in the United States,&rdquo; <em>Columbia Law Review</em>, volume 99, number 1, pp. 59&ndash;128.</p>
+
+<p>Ben Green, 2018. &ldquo;Data science as political action: Grounding data science in a politics of justice,&rdquo; <em>arXiv</em>:1811.03435 (6 November), at <a href="https://arxiv.org/abs/1811.03435" target="_blank">https://arxiv.org/abs/1811.03435</a>, accessed 5 September 2020.</p>
+
+<p>Kathryn Greene, Valerian J. Derlega, Gust A. Yep, and Sandra Petronio, 2003. <em>Privacy and disclosure of HIV in interpersonal relationships: A sourcebook for researchers and practitioners</em>. Mahwah, N.J.: Lawrence Erlbaum Associates.</p>
+
+<p>David M. Halperin, 2015. &ldquo;The biopolitics of HIV prevention discourse,&rdquo; In: Vernon W. Cisney and Nicolae Morar (editors). <em>Biopower: Foucault and beyond</em>. Chicago: University of Chicago Press, pp. 199&ndash;227.</p>
+
+<p>David M. Halperin and Trevor Hoppe (editors), 2017. <em>The war on sex</em>. Durham, N.C.: Duke University Press.</p>
+
+<p>Mark J. Handel and Irina Shklovski, 2012. &ldquo;Disclosure, ambiguity and risk reduction in real-time dating sites,&rdquo; <em>GROUP &rsquo;12: Proceedings of the 17th ACM International Conference on Supporting Group Work</em>, pp. 175&ndash;178.<br>doi: <a href="https://doi.org/10.1145/2389176.2389203" target="_blank">https://doi.org/10.1145/2389176.2389203</a>, accessed 5 September 2020.</p>
+
+<p>Jean Hardy and Silvia Lindtner, 2017. &ldquo;Constructing a desiring user: Discourse, rurality, and design in location-based social networks,&rdquo; <em>CSCW &rsquo;17: Proceedings of the 2017 ACM Conference on Computer Supported Cooperative Work and Social Computing</em>, pp. 13&ndash;25.<br>doi: <a href="https://doi.org/10.1145/2998181.2998347" target="_blank">https://doi.org/10.1145/2998181.2998347</a>, accessed 5 September 2020.</p>
+
+<p>Dini Harsono, Carol L. Galletly, Elaine O&rsquo;Keefe, and Zita Lazzarini, 2017. &ldquo;Criminalization of HIV exposure: A review of empirical studies in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 1, pp. 27&ndash;50.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1540-5" target="_blank">https://doi.org/10.1007/s10461-016-1540-5</a>, accessed 5 September 2020.</p>
+
+<p>Trevor Hoppe, 2018. <em>Punishing disease: HIV and the criminalization of sickness</em>. Berkeley: University of California Press.</p>
+
+<p>Hsiu-Fang Hsieh and Sarah E. Shannon, 2005. &ldquo;Three approaches to qualitative content analysis,&rdquo; <em>Qualitative Health Research</em>, volume 15, number 9, pp. 1,277&ndash;1,288.<br>doi: <a href="https://doi.org/10.1177/1049732305276687" target="_blank">https://doi.org/10.1177/1049732305276687</a>, accessed 5 September 2020.</p>
+
+<p>Jevan A. Hutson, Jessie G. Taft, Solon Barocas, and Karen Levy, 2018. &ldquo;Debiasing desire: Addressing bias &amp; discrimination on intimate platforms,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 73.<br>doi: <a href="https://doi.org/10.1145/3274342" target="_blank">https://doi.org/10.1145/3274342</a>, accessed 5 September 2020.</p>
+
+<p>Lilly Irani, Janet Vertesi, Paul Dourish, Kavita Philip, and Rebecca E. Grinter, 2010. &ldquo;Postcolonial computing: A lens on design and development,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,311&ndash;1,320.<br>doi: <a href="https://doi.org/10.1145/1753326.1753522" target="_blank">https://doi.org/10.1145/1753326.1753522</a>, accessed 5 September 2020.</p>
+
+<p>Steven J. Jackson, Tarleton Gillespie, and Sandy Payette, 2014. &ldquo;The policy knot: Re-integrating policy, practice and design in cscw studies of social computing,&rdquo; <em>CSCW &rsquo;14: Proceedings of the 17th ACM Conference on Computer Supported Cooperative Work &amp; Social Computing</em>, pp. 588&ndash;602.<br>doi: <a href="https://doi.org/10.1145/2531602.2531674" target="_blank">https://doi.org/10.1145/2531602.2531674</a>, accessed 5 September 2020.</p>
+
+<p>Paula C. Johnson, 1992. &ldquo;Silence equals death: The response to AIDS within communities of color,&rdquo; <em>University of Illinois Law Review</em>, volume 1992, pp. 1,075&ndash;1,083.</p>
+
+<p>Ralf J&uuml;rgens, Jonathan Cohen, Edwin Cameron, Scott Burris, Michaela Clayton, Richard Elliott, Richard Pearshouse, Anne Gathumbi, and Delme Cupido, 2009. &ldquo;Ten reasons to oppose the criminalization of HIV exposure or transmission,&rdquo; <em>Reproductive Health Matters</em>, volume 17, number 34, pp. 163&ndash;172.<br>doi: <a href="https://doi.org/10.1016/S0968-8080(09)34462-6" target="_blank">https://doi.org/10.1016/S0968-8080(09)34462-6</a>, accessed 5 September 2020.</p>
+
+<p>Gopinaath Kannabiran, Shaowen Bardzell, and Jeffrey Bardzell, 2012. &ldquo;Designing (for) desire: a critical study of technosexuality in HCI,&rdquo; <em>NordiCHI &rsquo;12: Proceedings of the Seventh Nordic Conference on Human-Computer Interaction: Making Sense Through Design</em>, pp. 655&ndash;664.<br>doi: <a href="https://doi.org/10.1145/2399016.2399116" target="_blank">https://doi.org/10.1145/2399016.2399116</a>, accessed 5 September 2020.</p>
+
+<p>C&eacute;cile Kazatchkine, Edwin Bernard, and Patrick Eba, 2015. &ldquo;Ending overly broad HIV criminalization: Canadian scientists and clinicians stand for justice,&rdquo; <em>Journal of the International AIDS Society</em>, volume 18, number 1, pp. 201&ndash;226.<br>doi: <a href="https://doi.org/10.7448/IAS.18.1.20126" target="_blank">https://doi.org/10.7448/IAS.18.1.20126</a>, accessed 5 September 2020.</p>
+
+<p>Os Keyes, Jevan Hutson, and Meredith Durbin, 2019. &ldquo;A mulching proposal: Analysing and improving an algorithmic system for turning the elderly into high-nutrient slurry,&rdquo; <em>CHI EA &rsquo;19: Extended Abstracts of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number alt06.<br>doi: <a href="https://doi.org/10.1145/3290607.3310433" target="_blank">https://doi.org/10.1145/3290607.3310433</a>, accessed 5 September 2020.</p>
+
+<p>Jeffrey V. Lazarus, Kelly Safreed-Harmon, Simon E. Barton, Dominique Costagliola, Nikos Dedes, Julia del Amo Valero, Jose M. Gatell, Ricardo Baptista-Leite, Lu&iacute;s Mend&atilde;o, Kholoud Porter, Stefano Vella, and J&uuml;rgen Kurt Rockstroh, 2016. &ldquo;Beyond viral suppression of HIV &mdash; The new quality of life frontier,&rdquo; <em>BMC Medicine</em>, volume 14, number 1, article number 94.<br>doi: <a href="https://doi.org/10.1186/s12916-016-0640-4" target="_blank">https://doi.org/10.1186/s12916-016-0640-4</a>, accessed 5 September 2020.</p>
+
+<p>J. Stan Lehman, Meredith H. Carr, Allison J. Nichol, Alberto Ruisanchez, David W. Knight, Anne E. Langford, Simone C. Gray, and Jonathan H. Mermin, 2014. &ldquo;Prevalence and public health implications of state laws that criminalize potential HIV exposure in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 997&ndash;1,006.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0724-0" target="_blank">https://doi.org/10.1007/s10461-014-0724-0</a>, accessed 5 September 2020.</p>
+
+<p>Karen Levy and Solon Barocas, 2018. &ldquo;Designing against discrimination in online markets,&rdquo; <em>Berkeley Technology Law Journal</em>, volume 32, number 3, pp. 1,183&ndash;1,237.<br>doi: <a href="https://doi.org/10.15779/Z38BV79V7K" target="_blank">https://doi.org/10.15779/Z38BV79V7K</a>, accessed 5 September 2020.</p>
+
+<p>Eric Lichtblau and William M. Arkin, 2014. &ldquo;More federal agencies are using undercover operations,&rdquo; <em>New York Times</em> (15 November), at <a href="https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html" target="_blank">https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html</a>, accessed 5 September 2020.</p>
+
+<p>Ann Light, 2011. &ldquo;HCI as heterodoxy: Technologies of identity and the queering of interaction with computers,&rdquo; <em>Interacting with Computers</em>, volume 23, number 5, pp. 430&ndash;438.<br>doi: <a href="https://doi.org/10.1016/j.intcom.2011.02.002" target="_blank">https://doi.org/10.1016/j.intcom.2011.02.002</a>, accessed 5 September 2020.</p>
+
+<p>Ben Light, Jean Burgess, and Stefanie Duguay, 2018. &ldquo;The walkthrough method: An approach to the study of apps,&rdquo; <em>New Media &amp; Society</em>, volume 20, number 3, pp. 881&ndash;900.<br>doi: <a href="https://doi.org/10.1177/1461444816675438" target="_blank">https://doi.org/10.1177/1461444816675438</a>, accessed 5 September 2020.</p>
+
+<p>Anish P. Mahajan, Jennifer N. Sayles, Vishal A. Patel, Robert H. Remien, Daniel Ortiz, Greg Szekeres, and Thomas J. Coates, 2008. &ldquo;Stigma in the HIV/AIDS epidemic: A review of the literature and recommendations for the way forward,&rdquo; <em>AIDS</em>, volume 22, supplement 2, pp. S67&ndash;S79.<br>doi: <a href="https://doi.org/10.1097/01.aids.0000327438.13291.62" target="_blank">https://doi.org/10.1097/01.aids.0000327438.13291.62</a>, accessed 5 September 2020.</p>
+
+<p>Alexandra McCallum, 2014. &ldquo;Criminalizing the transmission of HIV: Consent, disclosure, and online dating,&rdquo; <em>Utah Law Review</em>, volume 2014, number 3, article 5, at <a href="https://dc.law.utah.edu/ulr/vol2014/iss3/5" target="_blank">https://dc.law.utah.edu/ulr/vol2014/iss3/5</a>, accessed 5 September 2020.</p>
+
+<p>Donna Hubbard McCree and Matthew Hogben, 2010. &ldquo;The contribution to and context of other sexually transmitted diseases and tuberculosis in the HIV/AIDS epidemic among African Americans,&rdquo; In: Donna Hubbard McCree, Kenneth Jones, and Ann O&rsquo;Leary (editors). <em>African Americans and HIV/AIDS: Understanding and addressing the epidemic</em>, New York: Springer, pp. 3&ndash;12.<br>doi: <a href="https://doi.org/10.1007/978-0-387-78321-5_1" target="_blank">https://doi.org/10.1007/978-0-387-78321-5_1</a>, accessed 5 September 2020.</p>
+
+<p>William C. Miller, Carol A. Ford, Martina Morris, Mark S. Handcock, John L. Schmitz, Marcia M. Hobbs, Myron S. Cohen, Kathleen Mullan Harris, and J. Richard Udry, 2004. &ldquo;Prevalence of chlamydial and gonococcal infections among young adults in the United States,&rdquo; <em>Journal of the American Medical Association</em>, volume 291, number 18, pp. 2,229&ndash;2,236.<br>doi: <a href="https://doi.org/10.1001/jama.291.18.2229" target="_blank">https://doi.org/10.1001/jama.291.18.2229</a>, accessed 5 September 2020.</p>
+
+<p>Viviane Namaste, 2015. <em>Oversight: Critical reflections on feminist research and politics</em>. Toronto: Women&rsquo;s Press.</p>
+
+<p>Angela Perone, 2013. &ldquo;From punitive to proactive: An alternative approach for responding to HIV criminalization that departs from penalizing marginalized communities,&rdquo; <em>Hastings Women&rsquo;s Law Journal</em>, volume 24, pp. 363&ndash;406, and at <a href="https://repository.uchastings.edu/hwlj/vol24/iss2/5" target="_blank">https://repository.uchastings.edu/hwlj/vol24/iss2/5</a>, accessed 5 September 2020.</p>
+
+<p>Deana A. Pollard, 2006. &ldquo;Sex torts,&rdquo; <em>Minnesota Law Review</em>, volume 91, pp. 769&ndash;824, and at <a href="https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf" target="_blank">https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf</a>, accessed 5 September 2020.</p>
+
+<p>POZ, 2015. &ldquo;Man with HIV arrested for seeking sex on social media&rdquo; (22 July), at <a href="https://www.poz.com/article/stlouis-hiv-arrest-27534-4846" target="_blank">https://www.poz.com/article/stlouis-hiv-arrest-27534-4846</a>, accessed 5 September 2020.</p>
+
+<p>Russell K. Robinson, 2007. &ldquo;Structural dimensions of romantic preferences,&rdquo; <em>Fordham Law Review</em>, volume 76, pp. 2,787&ndash;2,820, and at <a href="http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/" target="_blank">http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/</a>, accessed 5 September 2020.</p>
+
+<p>Michael J. Rosenfeld and Reuben J. Thomas, 2012. &ldquo;Searching for a mate: The rise of the Internet as a social intermediary,&rdquo; <em>American Sociological Review</em>, volume 77, number 4, pp. 523&ndash;547.<br>doi: <a href="https://doi.org/10.1177/0003122412448050" target="_blank">https://doi.org/10.1177/0003122412448050</a>, accessed 5 September 2020.</p>
+
+<p>B.R. Simon Rosser, J. Michael Wilkerson, Derek J. Smolenski, J. Michael Oakes, Joseph Konstan, Keith J. Horvath, Gunna R. Kilian, David S. Novak, Gene P. Danilenko, and Richard Morgan, 2011. &ldquo;The future of Internet-based HIV prevention: A report on key findings from the Men&rsquo;s INTernet (MINTS-I, II) Sex Studies,&rdquo; <em>AIDS and Behavior</em>, volume 15, supplement 1, pp. S91&ndash;S100.<br>doi: <a href="https://doi.org/10.1007/s10461-011-9910-5" target="_blank">https://doi.org/10.1007/s10461-011-9910-5</a>, accessed 5 September 2020.</p>
+
+<p>Brian Schram, 2019. &ldquo;Accidental orientations: Rethinking queerness in archival times,&rdquo; <em>Surveillance &amp; Society</em>, volume 17, number 5, pp. 602&ndash;617.<br>doi: <a href="https://doi.org/10.24908/ss.v17i5.8688" target="_blank">https://doi.org/10.24908/ss.v17i5.8688</a>, accessed 5 September 2020.</p>
+
+<p>Junichi P. Semitsu, 2011. &ldquo;From Facebook to mug shot: How the dearth of social networking privacy rights revolutionized online government surveillance,&rdquo; <em>Pace Law Review</em>, volume 31, number 1, pp. 291&ndash;381, and at <a href="https://digitalcommons.pace.edu/plr/vol31/iss1/7" target="_blank">https://digitalcommons.pace.edu/plr/vol31/iss1/7</a>, accessed 5 September 2020.</p>
+
+<p>Sero Project, 2012. &ldquo;National criminalization survey preliminary results&rdquo; (25 July), at <a href="https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/" target="_blank">https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/</a>, accessed 30 August 2019.</p>
+
+<p>Julianne M. Serovich and Katie E. Mosack, 2003. &ldquo;Reasons for HIV disclosure or nondisclosure to casual sexual partners,&rdquo; <em>AIDS Education and Prevention</em>, volume 15, number 1, pp. 70&ndash;80.</p>
+
+<p>Natasha Singer, 2018. &ldquo;Grindr sets off privacy firestorm after sharing users&rsquo; H.I.V.-status data,&rdquo; <em>New York Times</em> (3 April), at <a href="https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html" target="_blank">https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html</a>, accessed 5 September 2020.</p>
+
+<p>Lucy Suchman, 2011. &ldquo;Anthropological relocations and the limits of design,&rdquo; <em>Annual Review of Anthropology</em>, volume 40, pp. 1&ndash;18.<br>doi: <a href="https://doi.org/10.1146/annurev.anthro.041608.105640" target="_blank">https://doi.org/10.1146/annurev.anthro.041608.105640</a>, accessed 5 September 2020.</p>
+
+<p>Cass R. Sunstein, 1996. &ldquo;Social norms and social roles,&rdquo; <em>Columbia Law Review</em>, volume 96, number 4, pp. 903&ndash;968.</p>
+
+<p>Patricia Sweeney, Simone C. Gray, David W. Purcell, Jenny Sewell, Aruna Surendera Babu, Brett A. Tarver, Joseph Prejean, and Jonathan Mermin, 2017. &ldquo;Association of HIV diagnosis rates and laws criminalizing HIV exposure in the United States,&rdquo; <em>AIDS</em>, volume 31, number 10, pp. 1,483&ndash;1,488.<br>doi: <a href="https://doi.org/10.1097/QAD.0000000000001501" target="_blank">https://doi.org/10.1097/QAD.0000000000001501</a>, accessed 5 September 2020.</p>
+
+<p>Bryan L. Sykes, Trevor A. Hoppe, and Kristen D. Maziarka, 2016. &ldquo;Cruel intentions? HIV prevalence and criminalization during an age of mass incarceration, U.S. 1999 to 2012,&rdquo; <em>Medicine (Baltimore)</em>, volume 95, number 16, e3352.<br>doi: <a href="https://doi.org/10.1097/MD.0000000000003352" target="_blank">https://doi.org/10.1097/MD.0000000000003352</a>, accessed 5 September 2020.</p>
+
+<p>Samuel Hardman Taylor, Jevan Alexander Hutson, and Tyler Richard Alicea, 2017. &ldquo;Social consequences of Grindr use: Extending the Internet-enhanced self-disclosure hypothesis,&rdquo; <em>CHI &rsquo;17: Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems</em>, pp. 6,645&ndash;6,657.<br>doi: <a href="https://doi.org/10.1145/3025453.3025775" target="_blank">https://doi.org/10.1145/3025453.3025775</a>, accessed 5 September 2020.</p>
+
+<p>Steven Thrasher, 2015. &ldquo;A Black body on trial: The conviction of HIV-positive &lsquo;Tiger Mandingo&rsquo;,&rdquo; <em>BuzzFeed News</em> (30 November), at <a href="https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m" target="_blank">https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m</a>, accessed 5 September 2020.</p>
+
+<p>Liming Wang, Dylan Podson, Zihuang Chen, Hongyan Lu, Vania Wang, Colin Shepard, John K. Williams, and Guodong Mi, 2019. &ldquo;Using social media to increase HIV testing among men who have sex with men &mdash; Beijing, China, 2013&ndash;2017,&rdquo; <em>Morbidity and Mortality Weekly Report</em>, volume 68, number 21, pp. 478&ndash;482.<br>doi: <a href="http://dx.doi.org/10.15585/mmwr.mm6821a3" target="_blank">http://dx.doi.org/10.15585/mmwr.mm6821a3</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward, 2005. &ldquo;Partner notification and contact-tracing,&rdquo; <em>Medicine</em>, volume 33, number 9, pp. 28&ndash;30.<br>doi: <a href="https://doi.org/10.1383/medc.2005.33.9.28" target="_blank">https://doi.org/10.1383/medc.2005.33.9.28</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward and Gill Bell, 2014. &ldquo;Partner notification,&rdquo; <em>Medicine (Abingdon)</em>, volume 42, number 6, pp. 314&ndash;317.<br>doi: <a href="https://doi.org/10.1016/j.mpmed.2014.03.013" target="_blank">https://doi.org/10.1016/j.mpmed.2014.03.013</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Andreas Gutmann, M. Angela Sasse, and Ann Blandford, 2018. &ldquo;Privacy unraveling around explicit HIV status disclosure fields in the online geosocial hookup app Grindr,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 181.<br>doi: <a href="https://doi.org/10.1145/3274450" target="_blank">https://doi.org/10.1145/3274450</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Juan F. Maestre, Jo Gibbs, Chia-Fang Chung, and Ann Blandford, 2019. &ldquo;Signal appropriation of explicit HIV status disclosure fields in sex-social apps used by gay and bisexual men,&rdquo; <em>CHI &rsquo;19: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number 692.<br>doi: <a href="https://doi.org/10.1145/3290605.3300922" target="_blank">https://doi.org/10.1145/3290605.3300922</a>, accessed 5 September 2020.</p>
+
+<p>Dylan Eric Wittkower, 2016. &ldquo;Lurkers, creepers, and virtuous interactivity: From property rights to consent to care as a conceptual basis for privacy concerns and information ethics,&rdquo; <em>First Monday</em>, volume 21, number 10, at <a href="https://firstmonday.org/article/view/6948/5628" target="_blank">https://firstmonday.org/article/view/6948/5628</a>, accessed 5 September 2020.<br>doi: <a href="https://doi.org/10.5210/fm.v21i10.6948" target="_blank">https://doi.org/10.5210/fm.v21i10.6948</a>, accessed 5 September 2020.</p>
+
+<p>Dan Wohlfeiler, Jennifer Hecht, Jonathan Volk, H. Fisher Raymond, Tom Kennedy, and Willi McFarland, 2013. &ldquo;How can we improve online HIV and STD prevention for men who have sex with men? Perspectives of hook-up website owners, website users, and HIV/STD directors,&rdquo; <em>AIDS and Behavior</em>, volume 17, number 9, pp. 3,024&ndash;3,033.<br>doi: <a href="https://doi.org/10.1007/s10461-012-0375-y" target="_blank">https://doi.org/10.1007/s10461-012-0375-y</a>, accessed 5 September 2020.</p>
+
+<p>Mar&iacute;a Cecilia Zea, Carol A. Reisen, Paul J. Poppen, and Rafael M. D&iacute;az, 2003. &ldquo;Asking and telling: communication about HIV status among Latino HIV-positive gay men,&rdquo; <em>AIDS and Behavior</em>, volume 7, number 2, pp. 143&ndash;152.<br>doi: <a href="https://doi.org/10.1023/A:1023994207984" target="_blank">https://doi.org/10.1023/A:1023994207984</a>, accessed 5 September 2020.</p>
+
+<p>Shoshana Zuboff, 2019. <em>The age of surveillance capitalism: The fight for a human future at the new frontier of power</em>. London: Profile Books.</p>
+
+<p>&nbsp;</p>
+<hr width="300">
+
+<p><strong>Editorial history</strong></p>
+<p>Received 17 October 2019; revised 12 February 2020; accepted 28 August 2020.</p>
+
+<hr>
+
+<p><a href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" src="https://i.creativecommons.org/l/by/4.0/80x15.png"></a><br>This paper is licensed under a <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</p>
+
+<p>Surveillance, stigma &amp; sociotechnical design for HIV<br>by Calvin Liang, Jevan Alexander Hutson, and Os Keyes.<br><em>First Monday</em>, Volume 25, Number 10 - 5 October 2020<br>https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729<br>doi: <a href="http://dx.doi.org/10.5210/fm.v25i10.10274" target="_blank">http://dx.doi.org/10.5210/fm.v25i10.10274</a></p>
+</blockquote>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html
new file mode 100644
index 0000000..2633256
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_landingpage.html
@@ -0,0 +1,616 @@
+ <!DOCTYPE html>
+<html lang="en-US" xml:lang="en-US">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ | First Monday
+ </title>
+
+
+<meta name="generator" content="Open Journal Systems 3.1.2.0">
+<link rel="icon" href="https://firstmonday.org/ojs/public/journals/3/favicon_en_US.gif">
+<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+<meta name="DC.Coverage" xml:lang="en" content=""/>
+<meta name="DC.Creator.PersonalName" content="Calvin Liang"/>
+<meta name="DC.Creator.PersonalName" content="Jevan Alexander Hutson"/>
+<meta name="DC.Creator.PersonalName" content="Os Keyes"/>
+<meta name="DC.Date.created" scheme="ISO8601" content="2020-09-10"/>
+<meta name="DC.Date.dateSubmitted" scheme="ISO8601" content="2019-09-15"/>
+<meta name="DC.Date.issued" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Date.modified" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Description" xml:lang="en" content="Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."/>
+<meta name="DC.Format" scheme="IMT" content="text/html"/>
+<meta name="DC.Identifier" content="10274"/>
+<meta name="DC.Identifier.DOI" content="10.5210/fm.v25i10.10274"/>
+<meta name="DC.Identifier.URI" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="DC.Language" scheme="ISO639-1" content="en"/>
+<meta name="DC.Rights" content="Copyright (c) 2020 First Monday"/>
+<meta name="DC.Rights" content=""/>
+<meta name="DC.Source" content="First Monday"/>
+<meta name="DC.Source.ISSN" content="1396-0466"/>
+<meta name="DC.Source.URI" content="https://firstmonday.org/ojs/index.php/fm"/>
+<meta name="DC.Subject" xml:lang="en" content="HIV"/>
+<meta name="DC.Subject" xml:lang="en" content="online dating"/>
+<meta name="DC.Subject" xml:lang="en" content="design"/>
+<meta name="DC.Subject" xml:lang="en" content="policy"/>
+<meta name="DC.Subject" xml:lang="en" content="surveillance"/>
+<meta name="DC.Subject" xml:lang="en" content="intimacy"/>
+<meta name="DC.Subject" xml:lang="en" content="social computing"/>
+<meta name="DC.Subject" xml:lang="en" content="social justice"/>
+<meta name="DC.Title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="DC.Type" content="Text.Serial.Journal"/>
+<meta name="DC.Type" xml:lang="en" content="Qualitative; Content analysis"/>
+<meta name="DC.Type.articleType" content="Articles"/>
+<meta name="gs_meta_revision" content="1.1"/>
+<meta name="citation_journal_title" content="First Monday"/>
+<meta name="citation_journal_abbrev" content="1"/>
+<meta name="citation_issn" content="1396-0466"/>
+<meta name="citation_author" content="Calvin Liang"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_author" content="Jevan Alexander Hutson"/>
+<meta name="citation_author_institution" content="University of Washington, School of Law"/>
+<meta name="citation_author" content="Os Keyes"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="citation_date" content="2020/09/10"/>
+<meta name="citation_doi" content="10.5210/fm.v25i10.10274"/>
+<meta name="citation_abstract_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="citation_language" content="en"/>
+<meta name="citation_keywords" xml:lang="en" content="HIV"/>
+<meta name="citation_keywords" xml:lang="en" content="online dating"/>
+<meta name="citation_keywords" xml:lang="en" content="design"/>
+<meta name="citation_keywords" xml:lang="en" content="policy"/>
+<meta name="citation_keywords" xml:lang="en" content="surveillance"/>
+<meta name="citation_keywords" xml:lang="en" content="intimacy"/>
+<meta name="citation_keywords" xml:lang="en" content="social computing"/>
+<meta name="citation_keywords" xml:lang="en" content="social justice"/>
+<meta name="citation_fulltext_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"/>
+<link rel="alternate" type="application/atom+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+<link rel="alternate" type="application/rdf+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+<link rel="alternate" type="application/rss+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <link rel="stylesheet" href="https://firstmonday.org/ojs/index.php/fm/$$$call$$$/page/page/css?name=stylesheet" type="text/css" /><link rel="stylesheet" href="//fonts.googleapis.com/css?family=Noto+Sans:400,400italic,700,700italic" type="text/css" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.css" type="text/css" /><link rel="stylesheet" href="https://firstmonday.org/ojs/public/journals/3/styleSheet.css" type="text/css" />
+</head>
+<body class="pkp_page_article pkp_op_view has_site_logo" dir="ltr">
+
+ <div class="cmp_skip_to_content">
+ <a href="#pkp_content_main">Skip to main content</a>
+ <a href="#pkp_content_nav">Skip to main navigation menu</a>
+ <a href="#pkp_content_footer">Skip to site footer</a>
+ </div>
+ <div class="pkp_structure_page">
+
+ <header class="pkp_structure_head" id="headerNavigationContainer" role="banner">
+ <div class="pkp_head_wrapper">
+
+ <div class="pkp_site_name_wrapper">
+ <div class="pkp_site_name">
+ <a href=" https://firstmonday.org/ojs/index.php/fm/index
+ " class="is_img">
+ <img src="https://firstmonday.org/ojs/public/journals/3/pageHeaderLogoImage_en_US.gif" width="252" height="102" alt="Page Header Logo" />
+ </a>
+ </div>
+ </div>
+
+
+ <nav class="pkp_navigation_primary_row" aria-label="Site Navigation">
+ <div class="pkp_navigation_primary_wrapper">
+ <ul id="navigationPrimary" class="pkp_navigation_primary pkp_nav_list">
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About
+ </a>
+ <ul>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About the Journal
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/editorialTeam">
+ Editorial Team
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/privacy">
+ Privacy Statement
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/contact">
+ Contact
+ </a>
+ </li>
+ </ul>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search">
+ Search
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/current">
+ Current
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/announcement">
+ Announcements
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/submissions">
+ Submissions
+ </a>
+ </li>
+ </ul>
+
+
+
+ <form class="pkp_search" action="https://firstmonday.org/ojs/index.php/fm/search/search" method="post" role="search">
+ <input type="hidden" name="csrfToken" value="671acac3a608346eb0eb4de1f26c7563">
+ <input name="query" value="" type="text" aria-label="Search Query">
+ <button type="submit">
+ Search
+ </button>
+ <div class="search_controls" aria-hidden="true">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search" class="headerSearchPrompt search_prompt" aria-hidden="true">
+ Search
+ </a>
+ <a href="#" class="search_cancel headerSearchCancel" aria-hidden="true"></a>
+ <span class="search_loading" aria-hidden="true"></span>
+ </div>
+</form>
+ </div>
+ </nav>
+
+ <nav class="pkp_navigation_user_wrapper" id="navigationUserWrapper" aria-label="User Navigation">
+ <ul id="navigationUser" class="pkp_navigation_user pkp_nav_list">
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/user/register">
+ Register
+ </a>
+ </li>
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/login">
+ Login
+ </a>
+ </li>
+ </ul>
+
+ </nav>
+ </div><!-- .pkp_head_wrapper -->
+ </header><!-- .pkp_structure_head -->
+
+ <div class="pkp_structure_content has_sidebar">
+ <div id="pkp_content_main" class="pkp_structure_main" role="main">
+
+<div class="page page_article">
+ <nav class="cmp_breadcrumbs" role="navigation" aria-label="You are here:">
+ <ol>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/index">
+ Home
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li class="current">
+ Articles
+ </li>
+ </ol>
+</nav>
+
+ <article class="obj_article_details">
+ <h1 class="page_title">
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ </h1>
+
+
+ <div class="row">
+ <div class="main_entry">
+
+ <ul class="item authors">
+ <li>
+ <span class="name">
+ Calvin Liang
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0002-3795-3441" target="_blank">
+ https://orcid.org/0000-0002-3795-3441
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Jevan Alexander Hutson
+ </span>
+ <span class="affiliation">
+ University of Washington, School of Law
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0003-3312-1733" target="_blank">
+ https://orcid.org/0000-0003-3312-1733
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Os Keyes
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0001-5196-609X" target="_blank">
+ https://orcid.org/0000-0001-5196-609X
+ </a>
+ </span>
+ </li>
+ </ul>
+
+ <div class="item doi">
+ <span class="label">
+ DOI:
+ </span>
+ <span class="value">
+ <a href="https://doi.org/10.5210/fm.v25i10.10274">
+ https://doi.org/10.5210/fm.v25i10.10274
+ </a>
+ </span>
+ </div>
+
+ <div class="item keywords">
+ <span class="label">
+ Keywords:
+ </span>
+ <span class="value">
+ HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice </span>
+ </div>
+
+ <div class="item abstract">
+ <h3 class="label">Abstract</h3>
+ <p>Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+ </div>
+
+
+
+ <div class="item author_bios">
+ <h3 class="label">
+ Author Biographies
+ </h3>
+ <div class="sub_item">
+ <div class="label">
+ Calvin Liang, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ <p>Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.</p>
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Jevan Alexander Hutson, <span class="affiliation">University of Washington, School of Law</span>
+ </div>
+ <div class="value">
+ Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing.
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Os Keyes, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
+ </div>
+ </div>
+ </div>
+
+
+ </div><!-- .main_entry -->
+
+ <div class="entry_details">
+
+ <div class="item cover_image">
+ <div class="sub_item">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ <img src="https://firstmonday.org/ojs/public/journals/3/cover_issue_678_en_US.png" alt="“Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet”">
+ </a>
+ </div>
+ </div>
+
+ <div class="item galleys">
+ <ul class="value galleys_links">
+ <li>
+
+
+
+
+<a class="obj_galley_link file" href="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729">
+
+
+ HTML
+
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="item published">
+ <div class="label">
+ Published
+ </div>
+ <div class="value">
+ 2020-09-10
+ </div>
+ </div>
+
+ <div class="item citation">
+ <div class="sub_item citation_display">
+ <div class="label">
+ How to Cite
+ </div>
+ <div class="value">
+ <div id="citationOutput" role="region" aria-live="polite">
+ <div class="csl-bib-body">
+ <div class="csl-entry">Liang, C., Hutson, J. A., &#38; Keyes, O. (2020). Surveillance, stigma &amp; sociotechnical design for HIV. <i>First Monday</i>, <i>25</i>(10). https://doi.org/10.5210/fm.v25i10.10274</div>
+</div>
+ </div>
+ <div class="citation_formats">
+ <button class="cmp_button citation_formats_button" aria-controls="cslCitationFormats" aria-expanded="false" data-csl-dropdown="true">
+ More Citation Formats
+ </button>
+ <div id="cslCitationFormats" class="citation_formats_list" aria-hidden="true">
+ <ul class="citation_formats_styles">
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274&amp;return=json"
+ >
+ ACM
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274&amp;return=json"
+ >
+ ACS
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274&amp;return=json"
+ >
+ APA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274&amp;return=json"
+ >
+ ABNT
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274&amp;return=json"
+ >
+ Chicago
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274&amp;return=json"
+ >
+ Harvard
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274&amp;return=json"
+ >
+ IEEE
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274&amp;return=json"
+ >
+ MLA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274&amp;return=json"
+ >
+ Turabian
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274&amp;return=json"
+ >
+ Vancouver
+ </a>
+ </li>
+ </ul>
+ <div class="label">
+ Download Citation
+ </div>
+ <ul class="citation_formats_styles">
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/ris?submissionId=10274">
+ <span class="fa fa-download"></span>
+ Endnote/Zotero/Mendeley (RIS)
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/bibtex?submissionId=10274">
+ <span class="fa fa-download"></span>
+ BibTeX
+ </a>
+ </li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="item issue">
+ <div class="sub_item">
+ <div class="label">
+ Issue
+ </div>
+ <div class="value">
+ <a class="title" href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ </div>
+ </div>
+
+ <div class="sub_item">
+ <div class="label">
+ Section
+ </div>
+ <div class="value">
+ Articles
+ </div>
+ </div>
+ </div>
+
+
+ <div class="item copyright">
+ <p>Authors retain copyright to their work published in <em>First Monday</em>. Please see the footer of each article for details.</p>
+ </div>
+
+
+
+ </div><!-- .entry_details -->
+ </div><!-- .row -->
+
+</article>
+
+
+
+</div><!-- .page -->
+
+ </div><!-- pkp_structure_main -->
+
+ <div class="pkp_structure_sidebar left" role="complementary" aria-label="Sidebar">
+ <div class="pkp_block block_developed_by">
+ <div class="content">
+ <a href="http://pkp.sfu.ca/ojs/">
+ Open Journal Systems
+ </a>
+ </div>
+</div>
+<div class="pkp_block block_web_feed">
+ <span class="title">Current Issue</span>
+ <div class="content">
+ <ul>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/atom.svg" alt="Atom logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss20_logo.svg" alt="RSS2 logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss10_logo.svg" alt="RSS1 logo">
+ </a>
+ </li>
+ </ul>
+ </div>
+</div>
+
+ </div><!-- pkp_sidebar.left -->
+ </div><!-- pkp_structure_content -->
+
+<div id="pkp_content_footer" class="pkp_structure_footer_wrapper" role="contentinfo">
+
+ <div class="pkp_structure_footer">
+
+ <div class="pkp_footer_content">
+ <p>A Great Cities Initiative of the University of Illinois at Chicago&nbsp;<a href="http://library.uic.edu/">University Library</a>.</p>
+<p>©&nbsp;<em>First Monday</em>, 1995-2020. ISSN&nbsp;1396-0466.</p>
+ </div>
+
+ <div class="pkp_brand_footer" role="complementary">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/aboutThisPublishingSystem">
+ <img alt="About this Publishing System" src="https://firstmonday.org/ojs/templates/images/ojs_brand.png">
+ </a>
+ </div>
+ </div>
+</div><!-- pkp_structure_footer_wrapper -->
+
+</div><!-- pkp_structure_page -->
+
+<script src="//ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js" type="text/javascript"></script><script src="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.0/jquery-ui.min.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/lib/pkp/js/lib/jquery/plugins/jquery.tag-it.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/popper/popper.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/util.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/dropdown.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/main.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/generic/citationStyleLanguage/js/articleCitation.js" type="text/javascript"></script><script type="text/javascript">
+(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+ga('create', 'UA-41314203-1', 'auto');
+ga('send', 'pageview');
+</script>
+
+
+</body>
+</html>
diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html
new file mode 100644
index 0000000..49cada8
--- /dev/null
+++ b/python/tests/files/genders_g58_fairlie.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>Genders OnLine Journal - Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film.</title>
+<meta name="description" content="Analysis of Hitchcock’s Rope (1948) as a critique of heteromasculinity that thematizes queer anguish, orality, and women’s relationship to the covert world of homosexual knowledge.">
+<meta name="keywords" content="homosexuality, homophobia, Cold War, the closet, heteromasculinity, queer anguish, anus, suspicion, orality, eating, cannibalism, Catholicism, knowledge, the cinematic cut, cinematic reality, women in Hitchcock, women and gay men, lack, hypocrisy, straight male interlocutor.">
+<style type="text/css">
+<!--
+
+td {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 13px;
+}
+
+.Section1 {
+ page:Section1;
+}
+-->
+</style>
+</head>
+<body alink="#000088" background="../image/back.jpg" vlink="#00aa00">
+<p>
+<table width="600">
+ <tbody>
+ <tr>
+ <td valign="top" width="90"><p><img src="../image/indlgo.gif" alt="Genders OnLine Journal" align="bottom" border="0" height="530" width="97"> </p></td>
+ <td align="right" valign="top" width="530"><table width="530">
+ <tbody>
+ <tr>
+ <td valign="top"><p><b><font size="2">Issue 58</font></b>, Fall 2013</p>
+ <p><font size="5"><strong>Reading Maeshowe</strong></font> <br>
+ Recovering the Feminine in a Neolithic Tomb</p>
+<p>By <strong>CHARLOTTE FAIRLIE</strong></p>
+ <p>[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.</p>
+ <p>[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are &ldquo;Stone Age marvels which inspire and astonish,&rdquo; and Simon W. Hall expresses the experiences of many when he refers to &ldquo;the profound impact of entering a tomb&rdquo; (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.</p>
+ <p> [3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, <u>The History of Orkney Literature</u>, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take &ldquo;the opportunity to reestablish the place&mdash;and, significantly, the inner lives of women in the prehistoric or early historical northern landscape&rdquo; (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie&rsquo;s, Schneider&rsquo;s, and Rose&rsquo;s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.</p>
+ <p>[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: &ldquo;the journey will be one of consequence.&rdquo; Moving from the light of day to the dark mysteries of a tomb&rsquo;s interior &ldquo;is a passage from the profane to the sacred.&rdquo; As such, &ldquo;it will involve transformation&rdquo; (&ldquo;Doorways&rdquo; 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, &ldquo;If the Orkney-Cromarty &lsquo;chambered&rsquo; tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?&rdquo; (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that &ldquo;the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation&rdquo; (&ldquo;Doorways&rdquo; 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were &ldquo;built to be experienced visually, physically and imaginatively,&rdquo; an experience which may well result in some kind of &ldquo;revelation&rdquo; (&ldquo;Doorways&rdquo; 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like <u>Beowulf</u>, <u>Hamlet</u>, or <u>Moby Dick</u>, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.</p>
+ <p>[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, &ldquo;Maes Howe Sappho.&rdquo; Noting the continuing appeal of the tomb, how today &ldquo;people still treasure&rdquo; the moment that the sun lines up with the passage, he compares the ancient monument to poetry:</p><blockquote>However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61)
+ </blockquote>
+ <p>Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments &ldquo;were and are locations that embodied the biography of the builders, users, spectators, and excavators&rdquo; (10). It follows that if we think of Maeshowe as a text, Brophy&rsquo;s assertion that the monument absorbs the &ldquo;biography&rdquo; of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.</p>
+ <p>[6] In a 1977 column for <u>The Orcadian</u> newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: &ldquo;Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 &ldquo;Two Maeshowe Poems&rdquo; (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See &ldquo;Heart of Neolithic Orkney WHS: Setting Project&rdquo; 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown&rsquo;s poem moves seamlessly through a series of images that integrate Brodgar&rsquo;s &ldquo;light and darkness&rdquo; with Maeshowe&rsquo;s &ldquo;flowers [and] stone&rdquo; (a reference to the runic graffiti carved by Vikings inside the tomb) and &ldquo;skulls&rdquo; (Lines 1, 9, 11). The first word of the poem, &ldquo;Circle,&rdquo; is semantically echoed in the initial word of each ensuing stanza, &ldquo;Ring,&rdquo; &ldquo;Wheel,&rdquo; and &ldquo;Round,&rdquo; subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of &ldquo;life and death&rdquo; (7). For this is a poem about regeneration, how &ldquo;Out of those skulls / Breaks the first green shoot, the full ear, then the bread&rdquo; (11-12). 
Throughout, juxtaposed images look for the positive to outweigh the negative: &ldquo;We move in shadows,&rdquo; but &ldquo;Brodgar has burned on the moor a dance of sun&rdquo;; &ldquo;Ring of quern and plough&rdquo; (a quern is a stone for grinding grain) are charged to &ldquo;contain / Our tumults of blood&rdquo;; &ldquo;The stars&rsquo; chaos is caught in a strict rein&rdquo;; the word &ldquo;stone&rdquo; is enveloped by &ldquo;flowers,&rdquo; and &ldquo;beauty and love&rdquo;; similarly, &ldquo;snow&rdquo; is flanked by &ldquo;sun&rdquo; and &ldquo;seed.&rdquo; So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.</p>
+ <p>[7] Brown&rsquo;s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted &ldquo;an analogy with the human life cycle and past generations&rdquo; (&ldquo;Doorways&rdquo; 65). Time&rsquo;s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were &ldquo;a metaphorical extension of daily life&rdquo; (&ldquo;Doorways&rdquo; 76). Trevor Garnham, an architect, develops that idea further: &ldquo;Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life&rsquo;s rebirth in some other form&rdquo; (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham&rsquo;s analogy is comparable to the positive final image of Brown&rsquo;s poem, the &ldquo;skulls&rdquo; engendering the &ldquo;green shoots&rdquo; and the &ldquo;bread&rdquo; of life.</p>
+ <p>[8] Brown had written earlier of Maeshowe in his 1996 poem, &ldquo;Maeshowe: Midwinter,&rdquo; choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, &ldquo;Equinox to Hallowmass,&rdquo; immediately integrates the astronomical with the sacred, giving the season of &ldquo;darkness&rdquo; both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes &ldquo;St Lucy,&rdquo; whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose &ldquo;shuttle&rdquo; creates &ldquo;a dark web&rdquo; that &ldquo;fills the loom&rdquo; (7-9), placing at the centre of the poem a world in which light is completely absent: &ldquo;The blackness is solid as a / stone that locks a tomb. / No star shines there&rdquo; (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the &ldquo;true ceremony,&rdquo; suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then &ldquo;the last fleeting solstice flame&rdquo; is &ldquo;caught up,&rdquo; leading to an optimistic note as the children&mdash;the future&mdash;sing with &ldquo;voices like leaves of light&rdquo; (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.</p>
+ <p>[9] While Brown&rsquo;s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as &ldquo;an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people&rsquo;s life into reciprocal relationships with the forces that give shape to their world&rdquo; (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown&rsquo;s two Maeshowe poems together reveals that the &ldquo;green shoot&rdquo; which produces the &ldquo;bread&rdquo; corresponds to the youthful &ldquo;voices like leaves of light.&rdquo; In fact, his insertion of &ldquo;leaves,&rdquo; with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed&mdash;working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth&mdash;are the web of human existence. The physical bread and the metaphysical song are one.</p>
+ <p>[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie&rsquo;s essay, &ldquo;Darkness and Light,&rdquo; describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible &ldquo;to see the real dark for the metaphorical dark . . .the death-dark.&rdquo; Enjoyment of the &ldquo;natural, courteous dark,&rdquo; she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that &ldquo;Maes Howe is a metaphor,&rdquo; perhaps exposing a flaw in Jamie&rsquo;s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor&rsquo;s crew assault her eyes, so she rediscovers no &ldquo;courteous darkness&rdquo; and witnesses &ldquo;no resurrecting beam of sunlight&rdquo; (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of &ldquo;wonder&rdquo; similar to Brown&rsquo;s in <u>The Orcadian</u>, she asks, &ldquo;Were they the first people . . . to articulate this metaphor of light and dark, of life and death?&rdquo; and reflects upon its significance:</p><blockquote>For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We&rsquo;re still mortal, still don&rsquo;t want to die, don&rsquo;t want our loved ones to die. (19-20)
+ </blockquote>
+ <p>Her rejection of a metaphor that she has considered &ldquo;[worn] out&rdquo; and &ldquo;redundant&rdquo; (4, 9) turns out to have been less literary and more personally psychological, for Jamie&rsquo;s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The &ldquo;tender, potent gesture&rdquo; of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).</p>
+ <p>[11] In her poem, &ldquo;Maeshowe,&rdquo; Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the &ldquo;chill seeps into your body&rdquo; (14). However, this ominous impression is immediately dismissed because &ldquo;a stillness that&rsquo;s other than death inhabits / this place where the undead gather to greet the dead&rdquo; (15-17). The journey through the passage will take &ldquo;you&rdquo; to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor&rsquo;s memories:</p><blockquote>In midwinter you&rsquo;ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26)
+ </blockquote>
+ <p>These images recall Garnham&rsquo;s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker&rsquo;s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the &ldquo;ground&rdquo; will turn &ldquo;gold,&rdquo; like a field of ripe grain. Schneider&rsquo;s poem, like Brown&rsquo;s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is &ldquo;piercing / the dark.&rdquo; There is a Romantic quality to these lines: Maeshowe will stay with Schneider&rsquo;s speaker as those daffodils stay with Wordsworth, &ldquo;to flash upon the inward eye / That is the bliss of solitude,&rdquo; to stimulate the imagination (24). Having herself benefited from the tomb&rsquo;s restorative qualities, the speaker is inspired to spread the word, to share her revelation with &ldquo;you,&rdquo; the reader.</p>
+ <p>[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as &ldquo;The first island poems,&rdquo; Brown quotes them emphatically in the second of the paired poems: &ldquo;INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES&rdquo; (&ldquo;Two&rdquo; 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in &ldquo;Maeshowe Nipple,&rdquo; for instance, lists the prosaic concerns of the Vikings, portraying them as &ldquo;intrepid&rdquo; but also homesick, missing &ldquo;sweethearts and family&rdquo; (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the &ldquo;young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind&rdquo;; &ldquo;the dragon boats moved on&rdquo; (Brown &ldquo;Two&rdquo; 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of &ldquo;the first green shoot, the full ear, then the bread&rdquo; in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, &ldquo;their tongue / took root and sprouted from invaded soil / green words for <u>Father</u>, <u>Daughter</u>, <u>Bread</u>&rdquo; (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are &ldquo;straight-branched&rdquo; (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. 
The bony runes, therefore, have become metaphorical seeds, and Rose&rsquo;s speaker, like Garnham, sees hope in the bone/seed analogy.</p>
+ <p>[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun&rsquo;s midwinter alignment, which Brown describes as a &ldquo;pledge of renewal, a cry of resurrection&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.</p>
+ <p>[14] Kathleen Jamie states early in her essay, &ldquo;We are conceived and carried in the darkness,&rdquo; emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as &ldquo;a complicit kiss,&rdquo; during which &ldquo;the beam of the setting sun shines along the passage, and onto the tomb&rsquo;s back wall&rdquo; (12). When she goes inside the tomb, she expects &ldquo;not utter darkness, but perhaps a wombish red&rdquo;; however, this is denied her because of the lights of the surveyors, one of whom is &ldquo;folded, foetus-like, into the little cell in the back wall&rdquo;: a foetus implanted in the very place where the sunbeam strikes (12, 13). When Jamie leaves, she describes taking &ldquo;the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air&rdquo; (17). The tunnel that admits the beam has become a birth canal, so Jamie&rsquo;s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.</p>
+ <p>[15] Myra Schneider&rsquo;s &ldquo;Maeshowe&rdquo; also hints that to visit the tomb is to return to the womb when the speaker remarks that although &ldquo;you&rdquo; are part of a tour group, you will realize that you are &ldquo;alone&rdquo; and have &ldquo;never travelled so far back / so far in&rdquo; (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: &ldquo;In that deep chamber / you&rsquo;ll be bathed in red, not the red spilt in hatred&mdash;/the red that&rsquo;s birth, the heart looming with the blood&rdquo; (24-28). In the vision that the speaker evokes for the visitor&rsquo;s memory, therefore, the &ldquo;dark that swaddles seeds&rdquo; not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors&rsquo; lights, Schneider suggests that it is possible for us to experience the &ldquo;wombish red&rdquo; that was denied Jamie, blood that is the force of life rather than the mark of violence.</p>
+ <p>[16] Dilys Rose&rsquo;s poem, &ldquo;Maeshowe Nipple,&rdquo; on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb&rsquo;s external anatomy: a &ldquo;breast,&rdquo; with an &ldquo;aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers&rdquo; (1-3). Within, Maeshowe&rsquo;s chambers have been &ldquo;invaded&rdquo; by men who &ldquo;inscribed their conquests&rdquo; and &ldquo;totted up the loot&rdquo; (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.</p>
+ <p>[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote <u>Early in Orcadia</u> after a friend took her to another of Orkney&rsquo;s chambered tombs, Isbister, which has no passage entry, because &ldquo;she knew it would waken something in me&rdquo; (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a &ldquo;few moons&rdquo; have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, &ldquo;the big bad Moon Woman had once had an honouring place,&rdquo; had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was &ldquo;pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in&rdquo; (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman&rsquo;s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters &ldquo;on her hands and knees . . . 
under and between great stones.&rdquo; Once inside, though, she thinks of the moments before she conceived her child: &ldquo;She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back&rdquo; (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the &ldquo;geo&rdquo; (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is &ldquo;push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must&rdquo; (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.</p>
+ <p>[18] By the end of <u>Early in Orcadia</u>, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the &ldquo;Great Man,&rdquo; who is &ldquo;painted with sun circles,&rdquo; to Moon Woman after he has summoned her to his presence: &ldquo;The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker&rdquo; (169). Moon Woman, however, is aware of the injustice of this arrangement: &ldquo;They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it&rdquo; (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:</p><blockquote>If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon&rsquo;s. (176)
+ </blockquote>
+ <p>She is picturing an intertwining of sun and moon, of masculine and feminine&mdash;a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.</p>
+ <p>[19] Mitchison&rsquo;s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: &ldquo;A fundamental polarity in many creation myths,&rdquo; according to Trevor Garnham, &ldquo;contrasts the dark, fecund, harbouring earth with the up-drawing sun.&rdquo; (145). He points out, for example, that &ldquo;by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun&rdquo; and that in Norse mythology, &ldquo;a male hierarchy supplanted older, matriarchal law&rdquo; (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that &ldquo;ritual activity associated with burial cairns became transferred to stone circles&rdquo; (152).</p>
+ <p>[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a &ldquo;nurturing role&rdquo; (66). The houses at the coastal village site, Scara Brae, therefore, &ldquo;seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket&rdquo; (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Scara Brae, and both were covered with turf (Garnham 48; Challands, Muir &amp; Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.</p>
+ <p>[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, &ldquo;In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction&rdquo; (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Scara Brae that was probably &ldquo;highly restricted on the basis of an individual&rsquo;s status, probably additionally defined in terms of age and gender.&rdquo; However, they also warn that there is insufficient archaeological evidence to &ldquo;leap to conclusions about a patriarchal group of &lsquo;elders&rsquo; who used knowledge as a commodity to maintain their power over women and younger men&rdquo; (Muir &amp; Richards 204). Although cautious, they do acknowledge that &ldquo;power and authority,&rdquo; probably based on &ldquo;cosmological beliefs,&rdquo; would have been necessary to build the monuments (199). Leaning not only on physical but also anthropological evidence, Garnham&rsquo;s view, on the other hand, is that the more formal structure <u>does</u> support the idea of hierarchy and that the estimated 100,000 man/hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers &ldquo;can be read as a metaphor of primogeniture&rdquo; (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade&rsquo;s <u>Patterns in Comparative Religion</u>, he also emphasizes that there is &ldquo;a close connection between solar theology and the elite&rdquo; (163). In this context it seems that &ldquo;allowing access to the sun . . . 
was more important that [sic] allowing access to members of the tribe&rdquo; (131-132).</p>
+ <p>[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its &ldquo;point of maximum power,&rdquo; Maeshowe allows the sun inside only when it is &ldquo;at its lowest ebb.&rdquo; Except at midwinter, &ldquo;the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon&rdquo; (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of <u>Early in Orcadia</u>, Garnham&rsquo;s interpretation of Maeshowe and the stone circles parallels Mitchison&rsquo;s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine &ldquo;whiteness of the moon,&rdquo; and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.</p>
+ <p>[23] Although <u>Early in Orcadia</u> ends with Moon Woman&rsquo;s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, &ldquo;Should I, then?&rdquo; (176). She does not ask &ldquo;Can I?&rdquo; but &ldquo;Should I?&rdquo; Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose&rsquo;s poem, 12th century Vikings may take up residence inside, but 900 years later, the reader is instructed to &ldquo;See,&rdquo; to bear witness to &ldquo;a green breast in a green field,&rdquo; the most nurturing part of a woman&rsquo;s body surrounded by the new growth of spring (1). When Schneider refers to the &ldquo;red that&rsquo;s birth&rdquo; rather than the &ldquo;red spilt in hatred,&rdquo; and describes how the sun will &ldquo;burnish the passage wall, / flood the ground with gold&rdquo; and, similarly, when Jamie refers to the &ldquo;complicit kiss,&rdquo; it is as if Moon Woman&rsquo;s consummation has finally taken place and justice restored.</p>
+ <p>[24] Richards asks where the doors of tombs lead, to what &ldquo;revelation.&rdquo; Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe&rsquo;s entryway towards &ldquo;revelation.&rdquo; Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, &ldquo;looking for Scotland in Scotland&rsquo;s prehistory&rdquo; (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be &ldquo;treading on the toes of archaeologists,&rdquo; points out that their physical &ldquo;evidence may not always offer a clear interpretation, in fact it very seldom does&rdquo; (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.</p>
+ <p>[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, &ldquo;This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery&rdquo; (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that &ldquo;the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place&rdquo; (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that &ldquo;Insights can only come from interpretation&rdquo; (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.</p>
+ <p>[26] George Mackay Brown has written, &ldquo;People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC&rdquo; (&ldquo;Brodgar Poems&rdquo; lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.</p>
+ <p>ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice. </p>
+ <p align="center">WORKS CITED</p>
+ <p>Bevan, Archie, and Brian Murray. Eds. <u>The Collected Poems of George Mackay Brown</u>. London: John Murray, 2005. Print.</p>
+ <p>Brown, George Mackay. &ldquo;Brodgar Poems (1992).&rdquo; In Bevan and Murray. 308-312. Print.</p>
+ <p>---. &ldquo;Maeshowe: Midwinter.&rdquo; 1996. In Bevan and Murray. 320. Print.</p>
+ <p>---. &ldquo;Maeshowe at Midwinter.&rdquo; 1977. <u>Under Binkie&rsquo;s Brae</u>. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.</p>
+ <p>---. &ldquo;Two Maeshowe Poems.&rdquo; 1999. In Bevan and Murray. 420-421. Print.</p>
+ <p>Card, Nick, et al. &ldquo;Bringing a Landscape to Life? Researching and Managing &lsquo;The Heart of Neolithic Orkney&rsquo; World Heritage Site.&rdquo; <u>World Archaeology</u> 39.3 (2007): 417-435. EBSCO <u>Academic Search Complete</u>. Web. 29 Jun. 2011.</p>
+ <p>Challands, Adrian, Tom Muir, and Colin Richards. &ldquo;The Great Passage Grave of Maeshowe.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.</p>
+ <p>Crawford, Robert. &ldquo;Maes Howe Sappho.&rdquo; <u>Yale Review</u>: 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.</p>
+ <p>Garnham, Trevor. <u>Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney</u>. Stroud, Gloucestershire: Tempus, 2004. Print.</p>
+ <p>Hall, Simon W. <u>The History of Orkney Literature</u>. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.</p>
+ <p>&ldquo;Heart of Neolithic Orkney WHS: Setting Project.&rdquo; Historic Scotland. 2008. EBSCO <u>Academic Search Complete</u>. Web. 30 Jun. 2011.</p>
+ <p>Jamie, Kathleen. &ldquo;Darkness and Light.&rdquo; <u>Findings: Essays on the Natural and Unnatural World</u>. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.</p>
+ <p>McKie, Robin. &ldquo;Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.&rdquo;</p>
+ <p><u>The Guardian / The Observer</u>. 6 Oct. 2012. Web. 16 Mar. 2013.</p>
+ <p>Mitchison, Naomi. <u>Early in Orcadia</u>. Glasgow: Richard Drew, 1987. Print.</p>
+ <p>Jones, Si&acirc;n, and Colin Richards. &ldquo;The Villagers of Barnhouse.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.</p>
+ <p>Richards, Colin. &ldquo;Doorways into Another World: The Orkney-Cromarty Chambered Tombs.&rdquo; <u>Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall</u>. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.</p>
+ <p>Riddoch, Lesley. &ldquo;Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.&rdquo; <u>The Scotsman</u>. 13 Feb. 2006. Web. 30 Jun. 2011.</p>
+ <p>Rose, Dilys. &ldquo;Maes Howe Nipple.&rdquo; <u>Bodywork</u>. Edinburgh: Luath Press, 2007. Print.</p>
+ <p>Schneider, Myra. &ldquo;Maeshowe.&rdquo; <u>Circling the Core</u>. London: Enitharmon Press, 2008. 23-24. Print.</p>
+ <p>Wordsworth, William. &ldquo;I wandered lonely as a cloud.&rdquo; <u>The Norton Anthology of English Literature</u>. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.</p>
+<p><strong>Contributor's Note</strong></p>
+ <p><strong>CHARLOTTE FAIRLIE</strong> teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.</p></td>
+ <td valign="top"><center>
+ <a href="../index.html"> <img src="../image/btncu.gif" alt="Current Issue" border="0" height="42" width="79"></a><br>
+ <a href="../download.html" tppabs="http://www.genders.org/download.html"> <img src="../image/btndo.gif" alt="Download" tppabs="http://www.genders.org/image/btndo.gif" align="bottom" border="0" height="42" width="115"></a><br>
+ <a href="../edit.html" tppabs="http://www.genders.org/edit.html"> <img src="../image/btned.gif" alt="Editorial Board" tppabs="http://www.genders.org/image/btned.gif" align="bottom" border="0" height="50" width="80"></a><br>
+ <a href="../guide.html" tppabs="http://www.genders.org/guide.html"> <img src="../image/btngu.gif" alt="Contributor Guidelines" tppabs="http://www.genders.org/image/btngu.gif" align="bottom" border="0" height="42" width="90"></a><br>
+ <a href="../recent.html"> <img src="../image/btnre.gif" alt="Recent Issues" tppabs="http://www.genders.org/image/btnre.gif" align="bottom" border="0" height="41" width="79"></a><br>
+ <a href="../link.html"> <img src="../image/btnli.gif" alt="Links &amp; Books" border="0" height="46" width="97"></a><br>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <table width="500">
+ <tbody>
+ <tr>
+ <td><p><a href="../download.html">Copyright</a> ©2010 Ann Kibbey.
+
+ All Rights Reserved Worldwide.<br>
+ </p>
+ <p> </p>
+ <center>
+ <a href="../download.html"><font size="1">Download</font></a><font size="1"> || <a href="../edit.html">Editorial Board</a> || <a href="../guide.html">Submission
+
+ Guidelines</a> || <a href="../index.html">Current Issue</a> || <a href="../recent.html">Recent Issues</a> || <a href="../link.html">Links
+
+ &amp; Books</a></font>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <p></p>
+ <p align="right">
+
+ <table width="550">
+ <tbody>
+ <tr>
+ <td width="361"></td>
+ <td width="72"><p><img src="../image/algosmlr.gif" alt="Genders" align="bottom" border="0" height="72" width="72"> </p></td>
+ <td width="101"><b> <font size="1">Genders Journal</font></b> <font size="1"><br>
+ 226 UCB<br>
+ University of Colorado<br>
+ Boulder, CO 80309<br>
+ http://www.Genders.org</font></td>
+ </tr>
+ </tbody>
+ </table>
+ </p>
+ <p align="right"></p></td>
+ </tr>
+ </tbody>
+</table>
+</p>
+<p></p>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html
new file mode 100644
index 0000000..177da83
--- /dev/null
+++ b/python/tests/files/nature_article.html
@@ -0,0 +1,1379 @@
+
+
+
+
+
+
+
+
+<!DOCTYPE html>
+<html lang="en" class="grade-c">
+<head>
+ <meta charset="utf-8">
+<link rel="dns-prefetch" href="//ajax.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.gstatic.com"/>
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
+
+ <title>More than 100 scientific journals have disappeared from the Internet</title>
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+ <meta property="og:type" content="article"/>
+ <meta property="og:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta property="og:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+ <meta name="twitter:card" content="summary_large_image"/>
+ <meta name="twitter:site" content="@nature"/>
+ <meta name="twitter:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta name="twitter:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta name="twitter:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+
+
+ <meta name="journal_id" content="41586"/>
+
+ <meta name="dc.title" content="More than 100 scientific journals have disappeared from the Internet"/>
+
+ <meta name="dc.source" content="Nature 2020"/>
+
+ <meta name="dc.format" content="text/html"/>
+
+ <meta name="dc.publisher" content="Nature Publishing Group"/>
+
+ <meta name="dc.date" content="2020-09-10"/>
+
+ <meta name="dc.type" content="News"/>
+
+ <meta name="dc.language" content="En"/>
+
+ <meta name="dc.copyright" content="2020 Nature"/>
+
+ <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="dc.description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="prism.publicationName" content="Nature"/>
+
+ <meta name="prism.publicationDate" content="2020-09-10"/>
+
+ <meta name="prism.section" content="News"/>
+
+ <meta name="prism.startingPage" content=""/>
+
+ <meta name="prism.endingPage" content=""/>
+
+ <meta name="prism.copyright" content="2020 Nature"/>
+
+ <meta name="prism.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="prism.url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+
+ <meta name="prism.doi" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="dc.identifier" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="DOI" content="10.1038/d41586-020-02610-z"/>
+
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="dc.creator" content="Diana Kwon"/>
+
+ <meta name="dc.subject" content="Publishing"/>
+
+
+
+<script>(function(e){var t=e.documentElement,n=e.implementation;t.className='js';if(n&&n.hasFeature('http://www.w3.org/TR/SVG11/feature#Image','1.1')){t.className+=' svg'}})(document)</script>
+<link rel="stylesheet" href="/static/css/mosaic-grade-c.26f07b2f11.css">
+
+<link rel="stylesheet" class="js-ctm" href="/static/css/magazine-mosaic-150.7f46c29843.css" media="only screen, print and (-webkit-min-device-pixel-ratio:0) and (min-color-index:0), (-ms-high-contrast: none), only all and (min--moz-device-pixel-ratio:0) and (min-resolution: 3e1dpcm)">
+
+
+ <style>
+ .c-header--brand-border {
+ border-bottom: 5px solid #000;
+ }
+ </style>
+
+<link rel="apple-touch-icon" sizes="180x180" href=/static/images/favicons/nature/apple-touch-icon.f39cb19454.png>
+<link rel="icon" type="image/png" sizes="32x32" href=/static/images/favicons/nature/favicon-32x32.3fe59ece92.png>
+<link rel="icon" type="image/png" sizes="16x16" href=/static/images/favicons/nature/favicon-16x16.951651ab72.png>
+<link rel="manifest" href=/static/manifest.1a481c42b1.json>
+<link rel="mask-icon" href=/static/images/favicons/nature/safari-pinned-tab.69bff48fe6.svg color="#000000">
+<link rel="shortcut icon" href=/static/images/favicons/nature/favicon.62367f778b.ico>
+<meta name="msapplication-TileColor" content="#000000">
+<meta name="msapplication-config" content=/static/browserconfig.e35b3b052c.xml>
+<meta name="theme-color" content="#000000">
+<meta name="application-name" content="Nature">
+
+<link rel="search" href="http://www.nature.com/search">
+<link rel="search" href="http://www.nature.com/opensearch/opensearch.xml" type="application/opensearchdescription+xml" title="nature.com">
+<link rel="search" href="http://www.nature.com/opensearch/request" type="application/sru+xml" title="nature.com">
+
+ <meta name="WT.cg_s" content="News"/>
+ <meta name="WT.z_cg_type" content="News"/>
+ <meta name="WT.page_categorisation" content="Article page"/>
+ <meta name="WT.z_subject_term" content="Publishing"/>
+
+<meta name="WT.template" content="oscar"/>
+<meta name="WT.cg_n" content="Nature"/>
+<meta name="dc.rights" content="©2020 Macmillan Publishers Limited. All Rights Reserved."/>
+<meta name="WT.z_bandiera_abtest" content="a"/>
+
+ <script data-test="dataLayer">
+ dataLayer = [{"content":{"category":{"contentType":"news","legacy":{"webtrendsPrimaryArticleType":"news","webtrendsSubjectTerms":"publishing","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Nature","webtrendsContentGroupType":null,"webtrendsContentSubGroup":"News"}},"article":{"doi":"10.1038/d41586-020-02610-z"},"attributes":{"cms":"core media","deliveryPlatform":"oscar","copyright":{"open":false,"legacy":{"webtrendsLicenceType":null}}},"contentInfo":{"authors":["Diana Kwon"],"publishedAt":1599696000,"publishedAtString":"2020-09-10","title":"More than 100 scientific journals have disappeared from the Internet","legacy":null,"publishedAtTime":null,"documentType":"aplusplus"},"journal":{"pcode":"nature","title":"nature","volume":null,"issue":null},"authorization":{"status":true},"features":[{"name":"furtherReadingSection","present":false}],"collection":null},"page":{"category":{"pageType":"article"},"attributes":{"template":"magazine mosaic","featureFlags":[{"name":"ab_test_news_feature","active":false}]},"search":null},"privacy":{},"version":"1.0.0","product":null,"session":null,"user":null,"backHalfContent":false}];
+</script>
+
+<script>
+ (function() {
+ function deleteCookie (name, domain) {
+ document.cookie = encodeURIComponent(name) +
+ '=' +
+ ';path=/' +
+ ';domain=' + domain +
+ ';expires=Thu, 01 Jan 1970 00:00:00 GMT';
+ }
+
+ var consentCookieParts = ('; ' + document.cookie).split('; OptanonConsent=');
+
+ if (consentCookieParts.length > 1) {
+ consentCookieParts.shift(); // remove redundant first part from the split array
+
+ // onetrust can set the same cookie multiple times with different domain specificities
+ for (let i=0; i<consentCookieParts.length; i++) {
+ var otCookieGroups = consentCookieParts[i].split('&groups=').pop().split('&').shift();
+
+ if (otCookieGroups.indexOf('C0001') === -1) {
+ deleteCookie('OptanonConsent', 'nature.com');
+ deleteCookie('OptanonAlertBoxClosed', 'nature.com');
+ }
+ }
+ }
+ })();
+</script>
+
+<script>
+ (function(w,d,t) {
+ function cc() {
+ var h = w.location.hostname;
+ if (h.indexOf('preview-www.nature.com') > -1) return;
+
+ var e = d.createElement(t),
+ s = d.getElementsByTagName(t)[0];
+
+ if (h.indexOf('nature.com') > -1) {
+ e.src = 'https://cdn.cookielaw.org/scripttemplates/otSDKStub.js';
+ e.setAttribute('data-domain-script', '83f2c78a-6cbc-4d1a-9088-3f8e8c4c7460');
+ } else {
+ e.src = '/static/js/cookie-consent-bundle.9d49adbc02.js';
+ e.setAttribute('data-consent', h);
+ }
+ s.parentNode.insertBefore(e, s);
+ }
+
+ !!w.google_tag_manager ? cc() : window.addEventListener('gtm_loaded', function() {cc()});
+ })(window,document,'script');
+</script>
+<script>
+ function OptanonWrapper() {
+ window.dataLayer.push({event:'OneTrustGroupsUpdated'});
+ document.activeElement.blur();
+ }
+</script>
+
+
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;
+
+
+ j.addEventListener('load', function() {
+ var _ge = new CustomEvent('gtm_loaded', { bubbles: true });
+ d.dispatchEvent(_ge);
+ });
+
+ f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-NWDMT9Q');</script>
+
+
+
+</head>
+<body>
+
+
+
+<div role="banner" class="position-relative cleared z-index-50 background-white" data-test="top-containers">
+
+
+ <a class="c-skip-link u-hide-print" href="#content">Skip to main content</a>
+
+
+
+
+
+
+
+ <aside class="c-ad c-ad--728x90">
+ <div class="c-ad__inner" data-container-type="banner-advert">
+ <p class="c-ad__label">Advertisement</p>
+
+
+
+ <div id="article-doubleclickad-container">
+ <div id="div-gpt-ad-top-1"
+ class="div-gpt-ad advert leaderboard js-ad text-center hide-print grade-c-hide"
+ data-ad-type="top"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="728x90"
+ data-gpt-targeting="type=article;pos=top;artid=d41586-020-02610-z;doi=10.1038/d41586-020-02610-z;subjmeta=479,648,706;kwrd=Publishing">
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing"
+ alt="Advertisement"
+ width="728"
+ height="90"></a>
+ </noscript>
+ </div>
+</div>
+
+
+
+
+ </div>
+ </aside>
+
+
+
+
+
+ <div class="c-grade-c-banner u-hide">
+ <div class="c-grade-c-banner__container">
+
+ <p>Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+ the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+ Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+ and JavaScript.</p>
+
+ </div>
+ </div>
+
+
+
+
+ <header class="c-header c-header--brand-border" id="header" data-header>
+ <div class="c-header__row-border">
+ <div class="c-header__container">
+ <div class="c-header__layout">
+ <a href="/nature"
+ data-track="click" data-track-action="home" data-track-category="nature-150-split-header" data-track-label="image">
+ <picture class="c-header__logo">
+ <source srcset="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" media="(min-width: 769px)">
+ <img src="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" alt="Nature">
+ </picture>
+ </a>
+ <div class="c-header__layout">
+
+ <div class="c-header__site-navigation c-header__site-navigation--show-at-md"
+ data-test="siteindex-link">
+ <a class="c-header__link" href="https://www.nature.com/siteindex"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open nature research index" data-track-label="link">
+ <span>View all Nature Research journals</span>
+ </a>
+ </div>
+
+ <div class="c-header__site-navigation c-header__site-navigation--border">
+ <a class="c-header__link"
+ href="#search-menu"
+ data-header-expander
+ data-test="search-link" data-track="click" data-track-category="nature-150-split-header" data-track-action="open search tray" data-track-label="button">
+ <span>Search</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M16.48 15.455c.283.282.29.749.007 1.032a.738.738 0 01-1.032-.007l-3.045-3.044a7 7 0 111.026-1.026zM8 14A6 6 0 108 2a6 6 0 000 12z"/></svg>
+ </a>
+ <a href="/nams/svc/myaccount"
+ id="my-account"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="my account" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>My Account</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+<a href="https://idp.nature.com/authorize/natureuser?client_id&#x3D;grover&amp;redirect_uri&#x3D;https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z"
+ id="login-button"
+ style="display: none;"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="login" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>Login</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="c-header__container" data-test="c-header__container">
+ <ul class="c-header__menu">
+
+ <li class="c-header__item" data-test="explore-content-button">
+ <a href="#explore"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open explore expander" data-track-label="button">
+ <span>Explore <span class="c-header__show-text">our content</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item">
+ <a href="#journal-info"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open journal information expander" data-track-label="button">
+ <span>Journal info<span class="c-header__show-text">rmation</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item c-header__item--pipe">
+ <a class="c-header__link"
+ href="https://www.nature.com/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-category="nature-150-split-header"
+ data-track-label="link">
+ <span>Subscribe</span>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+
+ </header>
+
+
+
+
+ <div class="u-mb-16">
+ <div class="u-container">
+ <ol class="c-breadcrumbs">
+ <li class="c-breadcrumbs__item" id="breadcrumb0"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb1"><a class="c-breadcrumbs__link"
+ href="/"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:nature"><span itemprop="title">nature</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb1"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb2"><a class="c-breadcrumbs__link"
+ href="/nature/articles?type&#x3D;news"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:news"><span itemprop="title">news</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb2"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb3"><span itemprop="title">article</span></li>
+ </ol>
+ </div>
+ </div>
+
+
+
+
+
+
+</div>
+
+
+ <div id="content" class="article-page position-relative z-index-1">
+ <section class="container highlight-container article-page--news container-with-gap">
+ <article class="article-item article-item--open" itemscope="" itemtype="http://schema.org/NewsArticle"
+ data-track-component="news">
+ <div class="container cleared container-type-article" data-container-type="article" itemprop="articleBody">
+ <div class="content position-relative cleared clear mq1200-padded" data-component="article-container"
+ role="main">
+ <header class="article-item__header clear cleared pull--both">
+ <div class="article__type">NEWS
+ <div class="ml10 article__date">
+ <time itemprop="datePublished">10 September 2020</time>
+ </div>
+ </div>
+
+ <div class="clear cleared"></div>
+ <h1 class="article-item__title serif" itemprop="headline">More than 100 scientific journals have disappeared from the Internet</h1>
+
+ <div class="article-item__teaser-text serif">
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.
+ </div>
+ </header>
+
+ <div class="clear cleared"></div>
+
+ <div class="bordered-container clear cleared pull--both">
+ <div id="author-affiliations" class="tab-group text14" role="tablist" data-test="author-affiliations" data-tab-group>
+ <div class="cleared">
+
+ <div id="author-affiliation-news-0" class="tab-box js-box-wrapper">
+ <h3 id="author-affiliation-news-0-head" data-track="click" data-track-label="view author info" class="sans-serif strong tab tab-skin ma0" role="tab"
+ aria-controls="author-affiliation-news-0-content" data-tooltip="Show author information">
+ Diana Kwon
+ </h3>
+ <div id="author-affiliation-news-0-content" class="tab-content pin-right grid grid-12 last"
+ role="tabpanel">
+ <div class="pa10" aria-labelledby="author-affiliation-news-0-head">
+ <div class="clear cleared">
+
+
+ <div class="align-left">
+ <h4 class="sans-serif">Search for this author in:</h4>
+ <ul class="ma0 clean-list">
+ <li class="strong"><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd&#x3D;search&amp;term&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Pub Med" >Pub Med</a></li>
+
+ <li class="strong"><a href="https://www.nature.com/search?order&#x3D;date_desc&amp;q&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Nature.com" >Nature.com</a></li>
+
+ <li class="strong"><a href="https://scholar.google.co.uk/scholar?as_q&#x3D;&amp;btnG&#x3D;Search+Scholar&amp;as_sauthors&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Google Scholar" >Google Scholar</a></li>
+ </ul>
+ </div>
+
+
+
+ </div>
+ </div>
+ </div>
+ </div>
+
+ </div>
+</div>
+
+ </div>
+
+ <div class="clear cleared pull--both">
+ <ul class="social clean-list inline-list hide-print">
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="twitter" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="https://twitter.com/intent/tweet?text=More+than+100+scientific+journals+have+disappeared+from+the+Internet&url=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Twitter</title>
+ <desc>Share on Twitter</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M20.8125,11.4875 C21.42,11.10375 21.8875,10.49625 22.105,9.7725 C21.5375,10.1275 20.90875,10.385 20.23875,10.5225 C19.70625,9.9225 18.9425,9.545 18.0975,9.545 C16.475,9.545 15.16,10.9325 15.16,12.6425 C15.16,12.885 15.185,13.1225 15.235,13.3475 C12.7975,13.2175 10.63125,11.985 9.1825,10.11 C8.93,10.56875 8.785,11.10125 8.785,11.66875 C8.785,12.74375 9.30375,13.69125 10.09125,14.2475 C9.61125,14.23125 9.1575,14.09 8.76125,13.86 L8.76125,13.8975 C8.76125,15.3975 9.77375,16.65125 11.11875,16.935 C10.87125,17.0075 10.6125,17.04375 10.34375,17.04375 C10.15625,17.04375 9.96875,17.025 9.79125,16.98875 C10.16625,18.22125 11.24875,19.11875 12.535,19.1425 C11.52875,19.97375 10.2625,20.4675 8.885,20.4675 C8.6475,20.4675 8.415,20.455 8.185,20.42625 C9.485,21.30375 11.02875,21.81625 12.6875,21.81625 C18.09,21.81625 21.04375,17.095 21.04375,13.00125 L21.03625,12.60125 C21.61125,12.16375 22.11125,11.6175 22.50125,10.99625 C21.97375,11.2425 21.4075,11.40875 20.81375,11.48375 L20.8125,11.4875 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="facebook" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Facebook</title>
+ <desc>Share on Facebook</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15.89625,22.8625 L12.57125,22.8625 L12.57125,15.02125 L10.90875,15.02125 L10.90875,12.31875 L12.57125,12.31875 L12.57125,10.69625 C12.57125,8.4925 13.50875,7.18 16.175,7.18 L18.39375,7.18 L18.39375,9.8825 L17.00625,9.8825 C15.96875,9.8825 15.9,10.26 15.9,10.965 L15.895,12.3175 L18.4075,12.3175 L18.115,15.02 L15.89625,15.02 L15.89625,22.8625 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="email" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="mailto:?subject=More than 100 scientific journals have disappeared from the Internet&body=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share via E-Mail</title>
+ <desc>Share via E-Mail</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15,15.3269887 L10.6248577,11.9177869 C10.4236021,11.7609644 10.1299323,11.7927468 9.96892789,11.988775 C9.80792343,12.1848031 9.84055341,12.4708451 10.041809,12.6276676 L14.7012493,16.2584003 C14.8680779,16.3940555 15.1152493,16.4013884 15.2915244,16.2640313 C15.2939898,16.2622325 15.2963784,16.2603294 15.2987507,16.2584003 L19.958191,12.6276676 C20.1594466,12.4708451 20.1920766,12.1848031 20.0310721,11.988775 C19.8700677,11.7927468 19.5763979,11.7609644 19.3751423,11.9177869 L15,15.3269887 Z M9,10 L21,10 C21.5522847,10 22,10.4477153 22,11 L22,19 C22,19.5522847 21.5522847,20 21,20 L9,20 C8.44771525,20 8,19.5522847 8,19 L8,11 C8,10.4477153 8.44771525,10 9,10 Z"></path>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+</ul>
+
+ </div>
+
+
+
+
+ <div class="align-left">
+
+ <div class="article__body serif cleared">
+ <p>Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August<sup><a href="#ref-CR1" data-track="click" data-action="anchor-link" data-track-label="go to reference" data-track-category="references">1</a></sup>.</p><p>“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"><h1 class="recommended__title serif">Investigating journals: The dark side of publishing</h1></a>
+ </aside></p><p>More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.</p><p>The study lays out a "compelling case" for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.</p><h2>Vanishing journals</h2><p>Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.</p><p>Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: Some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"><h1 class="recommended__title serif">Radical open-access plan could spell end to journal subscriptions</h1></a>
+ </aside></p><p>Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.</p><p>To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled into digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).</p><p>The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.</p><h2>Preserving the literature</h2><p>Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to have used a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. 
“It’s really hard to pin down when something doesn't absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-019-02038-0" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16870448.jpg"><h1 class="recommended__title serif">India culls hundreds more ‘dubious’ journals from government approved list</h1></a>
+ </aside></p><p>Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren't captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.</p><p>Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.</p>
+ </div>
+
+ <div class="emphasis">doi: <a href="https://doi.org/10.1038/d41586-020-02610-z">https://doi.org/10.1038/d41586-020-02610-z</a></div>
+ <div class="anchor-link mt40" data-toggle="anchor-links"></div>
+ <div id="references" class="references" data-toggle="anchor-links-section" data-label="References" data-concertina="true">
+ <section aria-labelledby="Bib1"><div class="serif article-section js-article-section cleared clear" id="Bib1-section"><h2 class="js-section-title section-title strong position-relative tighten-line-height background-gray-light pt20 pb6 pl0 pr20 standard-space-below small-space-above mq640-pt10 mq640-pb10 mq640-pl20 mq640-mt0 mq640-ml-20 mq640-mr-20 extend-left" id="Bib1">References</h2><div class="pl20 mq875-pl0 js-collapsible-section" id="Bib1-content"><div data-container-section="references"><ol class="clean-list ma0 standard-space-below indented-list" data-test="references-list"><li class="small-space-below border-gray-medium border-bottom-1 position-relative js-ref-item" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Article" data-test="citation"><span class="indented-counter serif h2 tighten-line-height text-right position-absolute grade-c-hide">1.</span><p class="tiny-space-below" id="ref-CR1">Laakso, M., Matthias, L. &amp; Jahn, N. Preprint at <a href="https://arxiv.org/abs/2008.11933">https://arxiv.org/abs/2008.11933</a> (2020).</p><ul class="js-ref-links clean-list cleared strong sans-serif text13 hide-print small-space-below"><li class="pin-right"><ul class="clean-list ma0"></ul></li></ul></li></ol><p class="hide-print text-right"><a href="/articles/d41586-020-02610-z-references.ris" class="text14 sans-serif strong" data-track="click" data-track-action="download citation references" data-track-label="link">Download references</a></p></div></div></div></section>
+ </div>
+
+
+
+
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="inPage box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-inPage-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-inPage">
+ <input id="briefing-box-signup-form-inPage-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-inPage-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-inPage-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-inPage-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-inPage-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+
+
+ </div>
+
+ <aside class="article__aside align-right">
+ <div class="related-content shrink--aside hide-print">
+
+ <h3 class="aside__title sans-serif">Related Articles</h3>
+ <ul class="ma0 clean-list">
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click"
+ data-track-label="related article (rank:0)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ </noscript>
+
+ Radical open-access plan could spell end to journal subscriptions
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click"
+ data-track-label="related article (rank:1)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ </noscript>
+
+ Investigating journals: The dark side of publishing
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-020-01066-5" data-track="click"
+ data-track-label="related article (rank:2)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ </noscript>
+
+ Nature to join open-access Plan S, publisher says
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07557-w" data-track="click"
+ data-track-label="related article (rank:3)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ </noscript>
+
+ Funders flesh out details of Europe’s bold open-access plan
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07245-9" data-track="click"
+ data-track-label="related article (rank:4)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ </noscript>
+
+ AI peer reviewers unleashed to ease publishing grind
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/open-access-the-true-cost-of-science-publishing-1.12676" data-track="click"
+ data-track-label="related article (rank:5)">
+
+ The true cost of science publishing
+ </a>
+ </h3>
+ </li>
+
+ </ul>
+ </div>
+
+ <div class="article__subjects bordered-container shrink--aside hide-print">
+ <h3 class="aside__title sans-serif">Subjects</h3>
+ <ul class="ma0 subject-list cleared clean-list inline-list">
+
+ <li class="subject"><a href="/subjects/publishing" data-track="click"
+ data-track-label="subject (rank:0)">Publishing</a>
+ </li>
+
+ </ul>
+ </div>
+
+
+
+<div id="div-gpt-ad-right-2"
+ class="div-gpt-ad medium-rectangle advert js-ad text-center hide-print grade-c-hide"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="300x250"
+ data-gpt-targeting="pos=right;artid=/articles/d41586-020-02610-z;path=/articles/d41586-020-02610-z"
+ data-ad-type="right"
+ >
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z"
+ alt="Advertisement"
+ width="300"
+ height="250"/>
+ </a>
+ </noscript>
+</div>
+
+
+ <div class="nature-briefing--sidebar bordered-container shrink--aside hide-print">
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="sidebar box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Sign up to Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-sidebar-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-sidebar">
+ <input id="briefing-box-signup-form-sidebar-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-sidebar-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-sidebar-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-sidebar-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-sidebar-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+</div>
+
+ </aside>
+ </div>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">
+ <meta content="Macmillan Publishers Limited, part of Springer Nature" itemprop="name"/>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="author" itemtype="https://schema.org/Organization">
+ <meta content="Nature Editorial" itemprop="name"/>
+ </div>
+ <img src="/platform/track/article/d41586-020-02610-z" width="1" height="1" alt="" class="visually-hidden"/>
+</article>
+
+
+
+
+
+
+
+<div class="c-site-messages message hide u-hide-print c-site-messages--nature-briefing c-site-messages--nature-briefing-email-variant c-site-messages--nature-briefing-redesign-2020 sans-serif"
+data-component-id="nature-briefing-banner"
+data-component-expirydays="30"
+data-component-trigger-scroll-percentage="15"
+data-track="in-view"
+data-track-action="in-view"
+data-track-category="nature briefing"
+data-track-label="redesign banner visible">
+
+
+ <div class="c-site-messages__banner-large">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__form-container">
+
+
+
+ <div class="grid grid-12 last">
+ <div class="grid grid-4">
+ <img alt="Nature Briefing" src="/static/images/logos/nature-briefing-logo-n150-white.d81c9da3ec.svg" width="250" height="40">
+ <p class="c-site-messages--nature-briefing__strapline extra-tight-line-height">Sign up for the <em>Nature Briefing</em> newsletter — what matters in science, free to your inbox daily.</p>
+ </div>
+ <div class="grid grid-8 last">
+ <form action="/briefing/signup/formfeedback" method="post" data-location="banner" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-banner-signup-form-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBannerRedesign2020">
+ <input id="briefing-banner-signup-form-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBanner">
+ <label class="nature-briefing-banner__email-label" for="banner-EmailAddressInput">Email address</label>
+
+ <div class="nature-briefing-banner__email-wrapper">
+ <input class="nature-briefing-banner__email-input box-sizing text14" type="email" id="banner-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-emailbanner-email-input">
+ <button type="submit" class="nature-briefing-banner__submit-button box-sizing text14" data-test-element="briefing-emailbanner-signup-button">Sign up</button>
+ </div>
+
+ <div class="nature-briefing-banner__checkbox-wrapper grid grid-12 last">
+ <input class="nature-briefing-banner__checkbox-checkbox" id="gdpr-briefing-banner-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-emailbanner-gdpr-checkbox" required>
+ <label class="nature-briefing-banner__checkbox-label box-sizing text13 sans-serif block tighten-line-height" for="gdpr-briefing-banner-checkbox">I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+ </form>
+ </div>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+ <div class="c-site-messages__banner-small">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__content text14">
+ <span class="c-site-messages--nature-briefing__strapline strong serif">Get the most important science stories of the day, free in your inbox.</span>
+ <a class="nature-briefing__link text14 sans-serif"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner CTA to site"
+ data-test-element="briefing-banner-link"
+ target="_blank"
+ rel="noreferrer noopener"
+ href="/briefing/signup/?origin=Nature&amp;originReferralPoint=EmailBanner">Sign up for Nature Briefing
+ </a>
+ </div>
+
+ </div>
+
+</div>
+
+ </section>
+</div>
+ <script>
+ window.onload = function () {
+ Array.prototype.slice.call(document.querySelectorAll(".magazine-infographic > iframe"))
+ .forEach(function (element) {
+ function listener(event) {
+ if (event.data.height) {
+ if (element.id === event.data.requestData.id) {
+ element.setAttribute("height", event.data.height)
+ }
+ }
+ }
+
+ window.addEventListener("message", listener);
+ element.contentWindow.postMessage({name: "getHeight", id: element.id}, "*");
+ });
+ }
+ </script>
+ <script>
+ var linkEl = document.querySelector('.js-ctm');
+ if (linkEl && window.matchMedia && window.matchMedia(linkEl.media).matches) {
+ var fragment = document.createDocumentFragment();
+ var polyfillScript = document.createElement('script');
+ var header150Script = null;
+ var appScript = document.createElement('script');
+ var sharedEs6Script = document.createElement('script');
+
+ polyfillScript.src = 'https://cdn.polyfill.io/v2/polyfill.min.js?features=default,IntersectionObserver,Array.prototype.includes,Promise';
+ polyfillScript.async = false;
+ fragment.appendChild(polyfillScript);
+
+ appScript.src = '/static/js/magazine/magazine-mosaic.71d8740808.js';
+ appScript.async = false;
+ fragment.appendChild(appScript);
+
+ sharedEs6Script.src = '/static/js/shared-es6-bundle.c83ed51f05.js';
+ sharedEs6Script.async = false;
+ fragment.appendChild(sharedEs6Script);
+
+ header150Script = document.createElement('script');
+ header150Script.src = '/static/js/header-150-bundle.aaea96385f.js';
+ header150Script.async = false;
+ fragment.appendChild(header150Script);
+
+ document.body.appendChild(fragment);
+ }
+ </script>
+ <script>
+ var idp = {
+ hasNatureUserProof: function (hasProof) {
+ if (!hasProof) {
+ document.getElementById("my-account").setAttribute("style", "display: none;");
+ document.getElementById("login-button").setAttribute("style", "");
+ }
+ }
+ }
+ </script>
+ <script src="https://verify.nature.com/verify/nature.min.js"></script>
+ <noscript>
+ <img src="https://verify.nature.com/verify/nature.png" alt="" width="0" height="0"/>
+ </noscript>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Explore-our-content" data-test="Explore-our-content" id="explore" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Explore-our-content" class="c-header-expander__heading u-js-hide">Explore our content</h2>
+ <ul class="c-header-expander__list">
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/research"
+ data-track="click"
+ data-track-action="research"
+ data-track-label="link">
+ Research
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/news"
+ data-track="click"
+ data-track-action="news"
+ data-track-label="link">
+ News
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/opinion"
+ data-track="click"
+ data-track-action="opinion"
+ data-track-label="link">
+ Opinion
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/research-analysis"
+ data-track="click"
+ data-track-action="research analysis"
+ data-track-label="link">
+ Research Analysis
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/careers"
+ data-track="click"
+ data-track-action="careers"
+ data-track-label="link">
+ Careers
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/books-culture"
+ data-track="click"
+ data-track-action="books and culture"
+ data-track-label="link">
+ Books and Culture
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/podcast"
+ data-track="click"
+ data-track-action="podcasts"
+ data-track-label="link">
+ Podcasts
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/videoarchive"
+ data-track="click"
+ data-track-action="videos"
+ data-track-label="link">
+ Videos
+ </a>
+ </li>
+
+
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/current-issue"
+ data-track="click"
+ data-track-action="current issue"
+ data-track-label="link">
+ Current Issue
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-issues"
+ data-track="click"
+ data-track-action="browse issues"
+ data-track-label="link">
+ Browse Issues
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/articles"
+ data-track="click"
+ data-track-action="browse articles"
+ data-track-label="link">
+ Browse Articles
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/collections"
+ data-track="click"
+ data-track-action="browse collections"
+ data-track-label="link">
+ Browse Collections
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-subjects"
+ data-track="click"
+ data-track-action="browse subjects"
+ data-track-label="link">
+ Browse Subjects
+ </a>
+ </li>
+
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="https://www.nature.com/my-account/alerts/subscribe-journal?list-id&#x3D;1"
+ data-track="click"
+ data-track-action="Sign up for alerts"
+ data-track-label="link">Sign up for alerts<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m4 10h2.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-3.08578644l-1.12132034 1.1213203c-.18753638.1875364-.29289322.4418903-.29289322.7071068v.1715729h14v-.1715729c0-.2652165-.1053568-.5195704-.2928932-.7071068l-1.7071068-1.7071067v-3.4142136c0-2.76142375-2.2385763-5-5-5-2.76142375 0-5 2.23857625-5 5zm3 4c0 1.1045695.8954305 2 2 2s2-.8954305 2-2zm-5 0c-.55228475 0-1-.4477153-1-1v-.1715729c0-.530433.21071368-1.0391408.58578644-1.4142135l1.41421356-1.4142136v-3c0-3.3137085 2.6862915-6 6-6s6 2.6862915 6 6v3l1.4142136 1.4142136c.3750727.3750727.5857864.8837805.5857864 1.4142135v.1715729c0 .5522847-.4477153 1-1 1h-4c0 1.6568542-1.3431458 3-3 3-1.65685425 0-3-1.3431458-3-3z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Journal-information" id="journal-info" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Journal-information" class="c-header-expander__heading u-js-hide">Journal information</h2>
+ <ul class="c-header-expander__list">
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/about"
+ data-track="click"
+ data-track-action="about the journal"
+ data-track-label="link">
+ About the Journal
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-authors"
+ data-track="click"
+ data-track-action="for authors"
+ data-track-label="link">
+ For Authors
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-referees"
+ data-track="click"
+ data-track-action="for referees"
+ data-track-label="link">
+ For Referees
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/awards"
+ data-track="click"
+ data-track-action="awards"
+ data-track-label="link">
+ Awards
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-label="link">
+ Subscribe
+ </a>
+ </li>
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="http://mts-nature.nature.com/"
+ data-track="click"
+ data-track-action="Submit manuscript"
+ data-track-label="link">Submit manuscript<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m15 0c1.1045695 0 2 .8954305 2 2v5.5c0 .27614237-.2238576.5-.5.5s-.5-.22385763-.5-.5v-5.5c0-.51283584-.3860402-.93550716-.8833789-.99327227l-.1166211-.00672773h-9v3c0 1.1045695-.8954305 2-2 2h-3v10c0 .5128358.38604019.9355072.88337887.9932723l.11662113.0067277h7.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-7.5c-1.1045695 0-2-.8954305-2-2v-10.17157288c0-.53043297.21071368-1.0391408.58578644-1.41421356l3.82842712-3.82842712c.37507276-.37507276.88378059-.58578644 1.41421356-.58578644zm-.5442863 8.18867991 3.3545404 3.35454039c.2508994.2508994.2538696.6596433.0035959.909917-.2429543.2429542-.6561449.2462671-.9065387-.0089489l-2.2609825-2.3045251.0010427 7.2231989c0 .3569916-.2898381.6371378-.6473715.6371378-.3470771 0-.6473715-.2852563-.6473715-.6371378l-.0010428-7.2231995-2.2611222 2.3046654c-.2531661.2580415-.6562868.2592444-.9065605.0089707-.24295423-.2429542-.24865597-.6576651.0036132-.9099343l3.3546673-3.35466731c.2509089-.25090888.6612706-.25227691.9135302-.00001728zm-.9557137-3.18867991c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5zm-8.5-3.587-3.587 3.587h2.587c.55228475 0 1-.44771525 1-1zm8.5 1.587c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+
+
+ <div id="search-menu" class="c-header-expander c-header-expander--tray u-hide-print" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <h2 class="u-visually-hidden">Search</h2>
+ <div data-test="inline-search">
+ <div class="c-header-expander__keyline u-mb-16">
+ <form action="/search"
+ method="get"
+ role="search"
+ class="c-header-expander__form"
+ autocomplete="off"
+ data-dynamic-track-label
+ data-track="submit" data-track-action="search" data-track-label="form">
+ <label class="c-header-expander__heading" for="keywords">Article Search</label>
+ <div class="c-form-field u-display-flex">
+ <input type="text"
+ class="c-form-field__input u-flex-shrink"
+ id="keywords"
+ name="q"
+ value=""
+ placeholder="Search by keywords or author"
+ data-test="search-keywords">
+ <button type="submit" class="c-button c-button--contrast u-flex-static u-ml-8" data-test="search-submit">Search</button>
+ </div>
+ <p class="u-ma-0">
+ <a href="/search/advanced"
+ data-track="click" data-track-action="advanced search" data-track-label="link">
+ Advanced search
+ </a>
+ </p>
+ </form>
+ </div>
+ <div class="c-header-expander__keyline">
+ <h3 class="c-header-expander__heading">Quick links</h3>
+ <ul class="u-list-reset">
+ <li class="u-display-inline-block u-mr-24"><a href="/subjects" data-track="click" data-track-action="explore articles by subject" data-track-label="link">Explore articles by subject</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/naturecareers" data-track="click" data-track-action="find a job" data-track-label="link">Find a job</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to authors</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+
+
+
+
+<footer role="contentinfo" class="composite-layer">
+ <div class="u-mt-16 u-mb-16">
+ <div class="u-container">
+ <div class="u-display-flex u-flex-wrap u-justify-content-space-between">
+ <p class="c-meta u-ma-0 u-mr-24">
+
+</p>
+
+ <p class="c-meta u-ma-0">
+ <span aria-level="2" class="c-meta__item" itemprop="name">
+ Nature
+ </span>
+ <span class="c-meta__item">
+ <abbr title="International Standard Serial Number">ISSN</abbr> <span itemprop="issn">1476-4687</span> (online)
+ </span>
+ </p>
+ </div>
+ </div>
+</div>
+
+
+ <div itemscope itemtype="http://schema.org/Periodical">
+ <meta itemprop="publisher" content="Springer Nature">
+ <div class="c-footer">
+ <div class="u-container">
+ <div class="u-hide-print" data-track-component="footer">
+ <h2 aria-level="2" class="u-visually-hidden">nature.com sitemap</h2>
+ <div class="c-footer__header">
+ <div class="c-footer__logo">
+ <img alt="Nature Research" src="/static/images/logos/nature research-white-150.f4acf77e0c.svg" loading="lazy" width="200" height="26">
+ </div>
+ <ul class="c-menu c-menu--inherit u-mr-32">
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/company_info/index.html" data-track="click" data-track-action="about us" data-track-label="link">About us</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/press_room/press_releases.html" data-track="click" data-track-action="press releases" data-track-label="link">Press releases</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://press.nature.com/" data-track="click" data-track-action="press office" data-track-label="link">Press office</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://support.nature.com/support/home" data-track="click" data-track-action="contact us" data-track-label="link">Contact us</a></li>
+ </ul>
+ <ul class="c-menu c-menu--inherit">
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.facebook.com/nature/" aria-label="Nature on Facebook" data-track="click" data-track-action="facebook" data-track-label="link">
+ <svg class="u-icon u-mt-2 u-mb-2" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 20 20"><path d="M2.5 20C1.1 20 0 18.9 0 17.5v-15C0 1.1 1.1 0 2.5 0h15C18.9 0 20 1.1 20 2.5v15c0 1.4-1.1 2.5-2.5 2.5h-3.7v-7.7h2.6l.4-3h-3v-2c0-.9.2-1.5 1.5-1.5h1.6V3.1c-.3 0-1.2-.1-2.3-.1-2.3 0-3.9 1.4-3.9 4v2.2H8.1v3h2.6V20H2.5z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://twitter.com/nresearchnews?lang=en" aria-label="Nature on Twitter" data-track="click" data-track-action="twitter" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M17.6 4.1c.8-.5 1.5-1.4 1.8-2.4-.8.5-1.7.9-2.6 1-.7-.8-1.8-1.4-3-1.4-2.3 0-4.1 1.9-4.1 4.3 0 .3 0 .7.1 1-3.4 0-6.4-1.8-8.4-4.4C1 2.9.8 3.6.8 4.4c0 1.5.7 2.8 1.8 3.6C2 8 1.4 7.8.8 7.5v.1c0 2.1 1.4 3.8 3.3 4.2-.3.1-.7.2-1.1.2-.3 0-.5 0-.8-.1.5 1.7 2 3 3.8 3-1.3 1.1-3.1 1.8-5 1.8-.3 0-.7 0-1-.1 1.8 1.2 4 1.9 6.3 1.9C13.8 18.6 18 12 18 6.3v-.6c.8-.6 1.5-1.4 2-2.2-.7.3-1.5.5-2.4.6z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.youtube.com/channel/UCvCLdSgYdSTpWcOgEJgi-ng" aria-label="Nature on YouTube" data-track="click" data-track-action="youtube" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M7.9 12.6V6.9l5.4 2.8c0 .1-5.4 2.9-5.4 2.9zM19.8 6s-.2-1.4-.8-2c-.8-.8-1.6-.8-2-.9-2.8-.2-7-.2-7-.2s-4.2 0-7 .2c-.4 0-1.2 0-2 .9-.6.6-.8 2-.8 2S0 7.6 0 9.2v1.5c0 1.7.2 3.3.2 3.3s.2 1.4.8 2c.8.8 1.8.8 2.2.9 1.6.1 6.8.2 6.8.2s4.2 0 7-.2c.4 0 1.2-.1 2-.9.6-.6.8-2 .8-2s.2-1.6.2-3.3V9.2c0-1.6-.2-3.2-.2-3.2z"/></svg>
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="c-footer__grid">
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Discover content</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/siteindex" data-track="click" data-track-action="journals a-z" data-track-label="link">Journals A-Z</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/subjects/" data-track="click" data-track-action="article by subject" data-track-label="link">Articles by subject</a></li>
+ <li class="c-footer__item"><a href="https://nano.nature.com/" data-track="click" data-track-action="nano" data-track-label="link">Nano</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/protocolexchange/" data-track="click" data-track-action="protocol exchange" data-track-label="link">Protocol Exchange</a></li>
+ <li class="c-footer__item"><a href="https://www.natureindex.com/" data-track="click" data-track-action="nature index" data-track-label="link">Nature Index</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Publish with us</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/author_resources/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to Authors</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/peer_review/" data-track="click" data-track-action="guide to referees" data-track-label="link">Guide to Referees</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/publishing-with-npg/" data-track="click" data-track-action="open access" data-track-label="link">Open access</a></li>
+ <li ><a href="https://www.nature.com/reprints/" data-track="click" data-track-action="reprints and permissions" data-track-label="link">Reprints &amp; permissions</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Researcher services</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/authors/research-data" data-track="click" data-track-action="data research service" data-track-label="link">Research data</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/go/nr" data-track="click" data-track-action="language editing" data-track-label="link">Language editing</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/scientific-editing/" data-track="click" data-track-action="scientific editing" data-track-label="link">Scientific editing</a></li>
+ <li class="c-footer__item"><a href="https://masterclasses.nature.com/" data-track="click" data-track-action="nature masterclasses" data-track-label="link">Nature Masterclasses</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/researcher-training/" data-track="click" data-track-action="nature research academies" data-track-label="link">Nature Research Academies</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Libraries &amp; institutions</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/tools-services" data-track="click" data-track-action="librarian service and tools" data-track-label="link">Librarian service &amp; tools</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/manage-your-account/librarianportal" data-track="click" data-track-action="librarian portal" data-track-label="link">Librarian portal</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/about-open-access/information-for-institutions/" data-track="click" data-track-action="open research" data-track-label="link">Open research</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Advertising &amp; partnerships</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/digital-advertising/" data-track="click" data-track-action="advertising" data-track-label="link">Advertising</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/" data-track="click" data-track-action="partnerships and services" data-track-label="link">Partnerships &amp; Services</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/media-kits/" data-track="click" data-track-action="media kits" data-track-label="link">Media kits</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/branded-content-native-advertising/" data-track-action="branded content" data-track-label="link">Branded content</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Career development</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/naturecareers" data-track="click" data-track-action="nature careers" data-track-label="link">Nature Careers</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureconferences/" data-track="click" data-track-action="nature conferences" data-track-label="link">Nature<span class="visually-hidden"> </span> Conferences</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureevents/" data-track="click" data-track-action="nature events" data-track-label="link">Nature<span class="visually-hidden"> </span> events</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Regional websites</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="http://www.naturechina.com" data-track="click" data-track-action="nature china" data-track-label="link">Nature China</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nindia" data-track="click" data-track-action="nature india" data-track-label="link">Nature India</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ja-jp/" data-track="click" data-track-action="nature japan" data-track-label="link">Nature Japan</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ko-kr/" data-track="click" data-track-action="nature korea" data-track-label="link">Nature Korea</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nmiddleeast/" data-track="click" data-track-action="nature middle east" data-track-label="link">Nature Middle East</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Legal &amp; Privacy</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/info/privacy.html" data-track="click" data-track-action="privacy policy" data-track-label="link">Privacy Policy</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/cookies.html" data-track="click" data-track-action="use of cookies" data-track-label="link">Use of cookies</a></li>
+ <li class="c-footer__item"><a class="optanon-toggle-display" href="javascript:;" data-track="click" data-track-action="manage cookies" data-track-label="link">Manage cookies/Do not sell my data</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/legal_notice.html" data-track="click" data-track-action="legal notice" data-track-label="link">Legal notice</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/accessibility_statement.html" data-track="click" data-track-action="accessibility statement" data-track-label="link">Accessibility statement</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/tandc.html" data-track="click" data-track-action="terms and conditions" data-track-label="link">Terms &amp; Conditions</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/ccpa" data-track="click" data-track-action="california privacy statement" data-track-label="link">California Privacy Statement</a></li>
+ </ul>
+ </div>
+ </div>
+</div>
+
+
+ </div>
+ </div>
+ </div>
+
+ <div class="c-corporate-footer">
+ <div class="u-container">
+ <img src="/static/images/logos/sn-logo-white.ea63208b81.svg" alt="Springer Nature" loading="lazy" width="140" height="14"/>
+ <p class="c-corporate-footer__legal" data-test="copyright">&copy; 2020 Springer Nature Limited</p>
+ </div>
+</div>
+
+
+ <svg class="u-hide hide">
+ <symbol id="global-icon-chevron-right" viewBox="0 0 16 16">
+ <path d="M7.782 7L5.3 4.518c-.393-.392-.4-1.022-.02-1.403a1.001 1.001 0 011.417 0l4.176 4.177a1.001 1.001 0 010 1.416l-4.176 4.177a.991.991 0 01-1.4.016 1 1 0 01.003-1.42L7.782 9l1.013-.998z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-download" viewBox="0 0 16 16">
+ <path d="M2 14c0-.556.449-1 1.002-1h9.996a.999.999 0 110 2H3.002A1.006 1.006 0 012 14zM9 2v6.8l2.482-2.482c.392-.392 1.022-.4 1.403-.02a1.001 1.001 0 010 1.417l-4.177 4.177a1.001 1.001 0 01-1.416 0L3.115 7.715a.991.991 0 01-.016-1.4 1 1 0 011.42.003L7 8.8V2c0-.55.444-.996 1-.996.552 0 1 .445 1 .996z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-email" viewBox="0 0 18 18">
+ <path d="M1.995 2h14.01A2 2 0 0118 4.006v9.988A2 2 0 0116.005 16H1.995A2 2 0 010 13.994V4.006A2 2 0 011.995 2zM1 13.994A1 1 0 001.995 15h14.01A1 1 0 0017 13.994V4.006A1 1 0 0016.005 3H1.995A1 1 0 001 4.006zM9 11L2 7V5.557l7 4 7-4V7z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-institution" viewBox="0 0 18 18">
+ <path d="M14 8a1 1 0 011 1v6h1.5a.5.5 0 01.5.5v.5h.5a.5.5 0 01.5.5V18H0v-1.5a.5.5 0 01.5-.5H1v-.5a.5.5 0 01.5-.5H3V9a1 1 0 112 0v6h8V9a1 1 0 011-1zM6 8l2 1v4l-2 1zm6 0v6l-2-1V9zM9.573.401l7.036 4.925A.92.92 0 0116.081 7H1.92a.92.92 0 01-.528-1.674L8.427.401a1 1 0 011.146 0zM9 2.441L5.345 5h7.31z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-search" viewBox="0 0 22 22">
+ <path fill-rule="evenodd" d="M21.697 20.261a1.028 1.028 0 01.01 1.448 1.034 1.034 0 01-1.448-.01l-4.267-4.267A9.812 9.811 0 010 9.812a9.812 9.811 0 1117.43 6.182zM9.812 18.222A8.41 8.41 0 109.81 1.403a8.41 8.41 0 000 16.82z"/>
+ </symbol>
+ <symbol id="global-icon-info" viewBox="0 0 18 18">
+ <path d="m9 0c4.9705627 0 9 4.02943725 9 9 0 4.9705627-4.0294373 9-9 9-4.97056275 0-9-4.0294373-9-9 0-4.97056275 4.02943725-9 9-9zm0 7h-1.5l-.11662113.00672773c-.49733868.05776511-.88337887.48043643-.88337887.99327227 0 .47338693.32893365.86994729.77070917.97358929l.1126697.01968298.11662113.00672773h.5v3h-.5l-.11662113.0067277c-.42082504.0488782-.76196299.3590206-.85696816.7639815l-.01968298.1126697-.00672773.1166211.00672773.1166211c.04887817.4208251.35902055.761963.76398144.8569682l.1126697.019683.11662113.0067277h3l.1166211-.0067277c.4973387-.0577651.8833789-.4804365.8833789-.9932723 0-.4733869-.3289337-.8699473-.7707092-.9735893l-.1126697-.019683-.1166211-.0067277h-.5v-4l-.00672773-.11662113c-.04887817-.42082504-.35902055-.76196299-.76398144-.85696816l-.1126697-.01968298zm0-3.25c-.69035594 0-1.25.55964406-1.25 1.25s.55964406 1.25 1.25 1.25 1.25-.55964406 1.25-1.25-.55964406-1.25-1.25-1.25z" fill-rule="evenodd"/>
+ </symbol>
+ </svg>
+
+</footer>
+
+
+</body>
+</html>
+
diff --git a/python/tests/files/peerj_oa_article.html b/python/tests/files/peerj_oa_article.html
new file mode 100644
index 0000000..f2cf365
--- /dev/null
+++ b/python/tests/files/peerj_oa_article.html
@@ -0,0 +1,2365 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+ <meta charset="utf-8">
+
+ <title>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles [PeerJ]</title>
+
+
+ <link rel="dns-prefetch" href="https://d2pdyyx74uypu5.cloudfront.net/">
+ <link rel="dns-prefetch" href="http://static.peerj.com/">
+<link rel="dns-prefetch" href="https://doi.org">
+
+
+ <meta name="citation_title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"><meta name="citation_date" content="2018-02-13"><meta name="citation_doi" content="10.7717/peerj.4375"><meta name="citation_language" content="en"><meta name="citation_pdf_url" content="https://peerj.com/articles/4375.pdf"><meta name="citation_fulltext_html_url" content="https://peerj.com/articles/4375"><meta name="citation_volume" content="6"><meta name="citation_firstpage" content="e4375"><meta name="citation_keywords" content="Open access; Open science; Scientometrics; Publishing; Libraries; Scholarly communication; Bibliometrics; Science policy"><meta name="citation_journal_title" content="PeerJ"><meta name="citation_journal_abbrev" content="PeerJ"><meta name="citation_publisher" content="PeerJ Inc."><meta name="citation_issn" content="2167-8359"><meta name="citation_author" content="Heather Piwowar"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="heather@impactstory.org"><meta name="citation_author" content="Jason Priem"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="jason@impactstory.org"><meta name="citation_author" content="Vincent Larivière"><meta name="citation_author_institution" content="École de bibliothéconomie et des sciences de l’information, Université de Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author" content="Juan Pablo Alperin"><meta name="citation_author_institution" content="Canadian Institute for Studies in Publishing, Simon Fraser University, Vancouver, BC, Canada"><meta name="citation_author_institution" 
content="Public Knowledge Project, Canada"><meta name="citation_author" content="Lisa Matthias"><meta name="citation_author_institution" content="Scholarly Communications Lab, Simon Fraser University, Vancouver, Canada"><meta name="citation_author" content="Bree Norlander"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Ashley Farley"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Jevin West"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author" content="Stefanie Haustein"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="School of Information Studies, University of Ottawa, Ottawa, ON, Canada">
+ <meta name="description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+
+ <meta property="og:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+ <meta name="twitter:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:url" content="https://peerj.com/articles/4375">
+ <meta name="twitter:site" content="@thePeerJ">
+ <meta name="twitter:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta name="twitter:description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+ <meta property="og:type" content="article">
+ <meta property="og:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta property="og:url" content="https://peerj.com/articles/4375">
+ <meta property="og:site_name" content="PeerJ">
+
+
+ <link rel="alternate" type="application/pdf" href="/articles/4375.pdf">
+ <link rel="alternate" type="application/rdf+xml" href="/articles/4375.rdf">
+ <link rel="alternate" type="application/json" href="/articles/4375.json">
+ <link rel="alternate" type="application/xml" href="/articles/4375.xml">
+ <link rel="alternate" type="application/unixref+xml" href="/articles/4375.unixref">
+ <link rel="alternate" type="application/vnd.citationstyles.csl+json" href="/articles/4375.citeproc">
+ <link rel="alternate" type="application/bibjson+json" href="/articles/4375.bibjson">
+ <link rel="alternate" type="text/html" href="/articles/4375.html">
+
+ <link rel="canonical" href="https://peerj.com/articles/4375/">
+
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta property="fb:app_id" content="534542813234464">
+
+ <link rel="stylesheet" href="/css/05b9c3d-27443c7.css" media="screen">
+
+<!--[if lt IE 9]>
+ <link rel="stylesheet" href="/assets/css/ie8.css" media="screen">
+<![endif]-->
+
+<!--[if lt IE 10]>
+ <link rel="stylesheet" href="/assets/css/ie9.css" media="screen">
+<![endif]-->
+
+ <style media="screen">html, body { height: 100%; }</style>
+ <link rel="stylesheet" href="https://cdn.peerj.com/webpack/vue-bundle.2cdd25e1.css">
+
+
+ <link rel="stylesheet" href="/css/a0c1a2c-04690d8.css" media="screen">
+
+ <link rel="stylesheet" href="/css/be477b9-1134171.css" media="screen">
+ <link rel="stylesheet" href="/css/3e4ba6d-c134b5f.css" media="print">
+ <script src="/js/36e5d51-2d7025c.js"></script>
+<script src="/assets/js/polyfills/includes.js"></script>
+<script src="/assets/js/polyfills/startsWith.js"></script><!--[if lt IE 9]>
+<script src="/assets/js/html5shiv.js"></script>
+
+<![endif]-->
+
+<!--[if lt IE 8]>
+<script src="/assets/js/json2.js"></script>
+<![endif]-->
+
+<script>
+ var PeerJ = {
+ Article: {},
+ User: {
+ anonymous: true },
+ Publication: {},
+ Production: {},
+ Event: {},
+ Com: {},
+ Payment: {},
+ Annotation: {},
+ Search: {},
+ Home: {},
+ Subjects: {},
+ Advocacy: {},
+ Job: {},
+ ContentAlert: {},
+ Tools: {}
+ };
+</script>
+
+
+<script>
+ var campaign_keywords = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'];
+ var kw = '';
+ var lastUtms = {};
+ var firstUtms = {};
+ var allUtms = {};
+
+ function campaignParams() {
+ var index;
+ for (index = 0; index < campaign_keywords.length; ++index) {
+ kw = getQueryParam(document.URL, campaign_keywords[index]);
+ if (kw.length) {
+ lastUtms[campaign_keywords[index] + '-last'] = kw;
+ firstUtms[campaign_keywords[index] + '-first'] = kw;
+ allUtms[campaign_keywords[index] + '-all'] = kw;
+ }
+ }
+ }
+
+ function updatePreregCookie(preregCookie, firstUtmKey) {
+ var utmVal = firstUtms[firstUtmKey];
+ if (utmVal) {
+ var existingPreregCampaign = $.cookie(preregCookie);
+ var appendPreregCampaign;
+ if (!existingPreregCampaign) {
+ appendPreregCampaign = utmVal;
+ } else {
+ appendPreregCampaign = existingPreregCampaign + ',' + utmVal;
+
+ }
+ $.cookie(preregCookie, appendPreregCampaign, {expires: 365, path: "/"});
+ }
+ }
+
+ function getQueryParam(url, param) {
+ // Expects a raw URL
+ param = param.replace(/[[]/, "\[").replace(/[]]/, "\]");
+ var regexS = "[\?&]" + param + "=([^&#]*)",
+ regex = new RegExp( regexS ),
+ results = regex.exec(url);
+ if (results === null || (results && typeof(results[1]) !== 'string' && results[1].length)) {
+ return '';
+ } else {
+ return decodeURIComponent(results[1]).replace(/\W/gi, ' ');
+ }
+ }
+
+ function articlePageEvent() {
+ var articleContainer = $('.publication-jsondata');
+ if (articleContainer.length) {
+ var data = articleContainer.data('publication-meta');
+
+ // Must be public
+ if (data.publicationSubjects.length) {
+
+ var eventName = 'Viewed-article';
+ var preprint = data.preprint;
+ if (preprint) {
+ eventName = 'Viewed-preprint';
+ }
+
+ data['ip-hash'] = 'bf3914b8088a79fb1fcf39cb526631c0';
+ mixpanel.track(eventName, data);
+ }
+ }
+ }
+
+ function sectionListViewEvent() {
+ }
+</script>
+ <script>
+ // User agrees to terms on signup, so Mixpanel is OK
+ // On submit, update mixpanel distinct id
+ setTimeout(function () {
+ var regmixpanel = document.getElementById('fos_user_registration_form_mixpanelId');
+ if (regmixpanel) {
+ var distinctId = $.cookie('pj_mp_distinct');
+ if (!distinctId) {
+ distinctId = mixpanel.get_distinct_id();
+ }
+ console.log(distinctId);
+ regmixpanel.value = distinctId;
+ }
+ }, 1500);
+
+ // If logged out then check if consented to analytics cookies (if applicable to country)
+ // Run through cookieConsent only
+ PeerJ.Com.Mixpanel = new function() {
+ this.leadView = function() {
+ mixpanel.init('776a79e14e8f05a81ca92536c83f08b4', {
+ 'secure_cookie': true,
+ loaded: function (mixpanel) {
+ setTimeout(function () {
+ articlePageEvent();
+
+ sectionListViewEvent();
+
+
+
+ }, 1000);
+ }
+ });
+ }
+ };
+
+ campaignParams();
+ updatePreregCookie('pj_prereg_campaign', 'utm_campaign-first');
+ updatePreregCookie('pj_prereg_content', 'utm_content-first');
+ updatePreregCookie('pj_prereg_term', 'utm_term-first');
+ </script>
+
+
+
+ <script>(function(p,u,s,h,x){p.pushpad=p.pushpad||function(){(p.pushpad.q=p.pushpad.q||[]).push(arguments)};h=u.getElementsByTagName('head')[0];x=u.createElement('script');x.async=1;x.src=s;h.appendChild(x);})(window,document,'https://pushpad.xyz/pushpad.js');
+pushpad('init', 5977, {hostname: 'peerj.com'});
+</script>
+
+ <link rel="search" type="application/opensearchdescription+xml" href="https://peerj.com/articles/osd.xml" title="PeerJ">
+
+
+
+
+
+ <script>
+ // Run through cookieConsent only
+ PeerJ.Com.GA = new function() {
+ this.disabletracking = function() {
+ window['ga-disable-' + 'UA-31208920-1'] = true;
+ };
+
+ this.runGA = function() {
+ (function (i, s, o, g, r, a, m) {
+ i['GoogleAnalyticsObject'] = r;
+ i[r] = i[r] || function () {
+ (i[r].q = i[r].q || []).push(arguments)
+ }, i[r].l = 1 * new Date();
+ a = s.createElement(o),
+ m = s.getElementsByTagName(o)[0];
+ a.async = 1;
+ a.src = g;
+ m.parentNode.insertBefore(a, m)
+ })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');
+
+ ga('create', 'UA\u002D31208920\u002D1', 'auto');
+
+ // Removes last octet
+ ga('set', 'anonymizeIp', true);
+
+
+
+
+
+
+
+
+
+ ga('set', 'dimension4', ';Legal\u0020Issues\u003BScience\u0020Policy\u003BData\u0020Science;');
+
+ ga('require', 'displayfeatures');
+
+ ga('send', 'pageview');
+
+ window.setTimeout(function () {
+ ga('send', 'event', 'adjusted bounce rate', 'page visit 15 seconds or more');
+ }, 15000);
+
+
+ }
+ };
+ </script>
+ <script src="/js/8548491-f0f5b7c.js"></script>
+
+<link rel="apple-touch-icon" sizes="57x57" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-57x57.png">
+<link rel="apple-touch-icon" sizes="60x60" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-60x60.png">
+<link rel="apple-touch-icon" sizes="72x72" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-72x72.png">
+<link rel="apple-touch-icon" sizes="76x76" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-76x76.png">
+<link rel="apple-touch-icon" sizes="114x114" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-114x114.png">
+<link rel="apple-touch-icon" sizes="120x120" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-120x120.png">
+<link rel="apple-touch-icon" sizes="144x144" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-144x144.png">
+<link rel="apple-touch-icon" sizes="152x152" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-152x152.png">
+<link rel="apple-touch-icon" sizes="180x180" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-180x180.png">
+<link rel="icon" type="image/png" sizes="192x192" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/android-icon-192x192.png">
+<link rel="shortcut icon" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon.ico">
+<link rel="icon" type="image/png" sizes="32x32" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-32x32.png">
+<link rel="icon" type="image/png" sizes="96x96" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-96x96.png">
+<link rel="icon" type="image/png" sizes="16x16" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-16x16.png">
+<link rel="manifest" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/manifest.json">
+<meta name="msapplication-TileColor" content="#ffffff">
+<meta name="msapplication-TileImage" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/ms-icon-144x144.png">
+<meta name="msapplication-config" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/browserconfig.xml">
+<meta name="theme-color" content="#ffffff"></head>
+
+<body class="">
+
+ <!-- FreshDesk variable (TODO: move elsewhere) -->
+
+
+<nav class="navbar navbar-fixed-top navbar-inverse navbar-alpha" role="navigation"><div class="navbar-inner"><!-- .btn-navbar is used as the toggle for collapsed navbar content --><a class="btn btn-navbar pull-right" data-toggle="collapse" data-target=".nav-collapse"><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></a><!-- logo --><ul class="nav pull-left nav-sections nav-journal"><li class="dropdown"><a href="/" class="dropdown-toggle "
+ data-toggle="dropdown"><span id="navJournalTitle">PeerJ Journals</span><b class="caret"></b></a><ul class="dropdown-menu journal-list"><li><a href="/">PeerJ Publishing Overview</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">PeerJ – Life & Environment</a><ul class="dropdown-menu"><li><a href="/sections/">About the journal Sections</a></li><li class="divider"></li><li><a href="/sections/aquatic-biology/">Aquatic Biology</a></li><li><a href="/sections/biochemistry-biophysics-molecular-biology/">Biochemistry, Biophysics and Molecular Biology</a></li><li><a href="/sections/biodiversity-conservation/">Biodiversity and Conservation</a></li><li><a href="/sections/bioinformatics-genomics/">Bioinformatics and Genomics</a></li><li><a href="/sections/brain-cognition/">Brain and Cognition</a></li><li><a href="/sections/ecology/">Ecology</a></li><li><a href="/sections/environ-sci/">Environmental Science</a></li><li><a href="/sections/microbiology/">Microbiology</a></li><li><a href="/sections/paleontology-evolutionary-science/">Paleontology and Evolutionary Science</a></li><li><a href="/sections/plant-biology/">Plant Biology</a></li><li><a href="/sections/zoological-science/">Zoological Science</a></li></ul></li><li><a href="/computer-science/">
+ PeerJ Computer Science
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Physical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Organic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Inorganic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Analytical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Materials Science
+ </a></li><li class="divider"></li><li><a href="https://peerj.org/" target="_blank">Visit PeerJ.org and get involved</a></li></ul></li></ul><!-- mobile-only top nav items --><ul class="nav pull-left nav-about-phone hidden-desktop"><li class="dropdown"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><li><!-- checkout items --></li><li><!-- notifications --></li></ul><!-- sections --><ul class="nav pull-left nav-collapse nav-sections nav-sections-main collapse search-hide"><li class="dropdown visible-desktop"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><!-- more --><li class="dropdown"><a href="#" class="dropdown-toggle"
+ data-toggle="dropdown">More <b class="caret"></b></a><ul class="dropdown-menu" role="menu" aria-labelledby="dLabel"><li><a href="/expertrxiv/"><img src="/assets/images/icons/expertrxiv.png" style="width: 80px"/></a></li><li><a href="/subjects/">Subjects</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Search articles</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Peer-reviewed Journals</li><li><a tabindex="-1" href="/articles/?journal=peerj">PeerJ (Life, Biological, Environmental and Health Sciences)</a></li><li><a tabindex="-1" href="/articles/?journal=cs">PeerJ Computer Science</a></li><li><a tabindex="-1" href="/articles/?journal=pchem">PeerJ Physical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ochem">PeerJ Organic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ichem">PeerJ Inorganic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=achem">PeerJ Analytical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=matsci">PeerJ Materials Science</a></li><li role="presentation" class="dropdown-header">Preprints</li><li><a tabindex="-1" href="/preprints/">PeerJ Preprints</a></li></ul></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Table of contents</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Table of Contents - current and archives</li><li><a tabindex="-1" href="/medicine/">PeerJ - Medicine articles</a></li><li><a tabindex="-1" href="/biology/">PeerJ - Biology & Life science articles</a></li><li><a tabindex="-1" href="/environment/">PeerJ - Environmental Science articles</a></li><li><a tabindex="-1" href="/general/">PeerJ - General bio (stats, legal, policy, edu)</a></li><li class="divider"></li><li><a tabindex="-1" href="/cs/">PeerJ Computer Science</a></li><li class="divider"></li><li><a tabindex="-1" href="/preprints-toc/">PeerJ Preprints</a></li></ul></li><li><a 
href="/academic-boards/advisors/">Academic advisors</a></li><li><a href="/reviewer-match/">Volunteer to review</a></li><li><a href="/collections/">Collections</a></li><li><a href="/questions/">Discussions</a></li><li><a href="https://peerj.com/blog/">Blog</a></li><li><a href="/prepaid-publishing/">Prepaid Publishing</a></li><li><a href="/about/reviews/">Reviews and awards</a></li><li><a href="/spread-the-word/">Spread the word</a></li><li><a href="/about/">Who are we?</a></li><li><a href="/about/contact/">Contact</a></li></ul></li></ul><!-- search --><div class="nav nav-collapse collapse pull-right nav-search"><form class="navbar-search" action="/search/"><input name="q" type="search"
+ data-autocomplete-url="/search/"
+ class="search-query" placeholder="Search"><!--<i class="icon-search"></i>--></form></div><ul class="nav pull-right nav-collapse collapse search-hide nav-utilities"><!-- login desktop --><li><a id="front-page-login" href="/login">Login</a></li></ul><ul class="nav pull-right search-hide nav-shifter"></ul><!-- for authors, my manuscripts --><ul class="nav nav-center nav-collapse collapse search-hide pull-right"><!-- for authors --><li class="dropdown nav-authors"><a href="#" class="dropdown-toggle" data-toggle="dropdown"><i
+ class="icon-info4 icon-large nav-icon icomoon"></i><span class="visible-wide">AUTHORS</span><b class="caret"></b></a><ul class="dropdown-menu"><li><a href="/benefits/">Peer Journals Overview</a></li><li><a href="/about/author-instructions/">Submission Guidelines</a></li><li><a href="/subjects/">Subject Areas</a></li><li><a href="/academic-boards/">Editorial Board</a></li><li><a href="/about/editorial-criteria/">Editorial Criteria</a></li><li><a href="/pricing/">Pricing</a></li><li><a href="/about/FAQ/">General FAQ</a></li><li><a href="/computer-science/faq-cs/">Computer Science FAQ</a></li><li><a href="/about/aims-and-scope/">Aims and Scope</a></li><li><a href="/about/author-interviews/">Author Interviews</a></li><li><a href="/about/policies-and-procedures/">Policies and Procedures</a></li><!--<li><a href="#">Why PeerJ?</a></li>--></ul></li><!-- my manuscripts --><!-- note: dropdown classes used just to maintain display --><li class="nav-manuscripts dropdown"><a href="/new/" class="dropdown-toggle"><span>SUBMIT ARTICLE</span></a></li></ul></div></nav>
+
+ <div class="item-top-navbar">
+ <div class="item-top-navbar-inner">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <div class="span12">
+ <div class="item-metrics-counts-top-nav article-item-metrics-counts">
+ <span class="article-item-metrics-count visible-all">
+ <span data-count="citations">203</span>
+ <span class="article-item-metrics-label">Citations</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-html">&nbsp;</span>
+ <span class="article-item-metrics-label">Views</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-pdf">&nbsp;</span>
+ <span class="article-item-metrics-label">Downloads</span>
+ </span>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</div>
+
+ <div id="wrap">
+
+
+
+ <div id="nav-pad"></div>
+
+
+ <div class="container">
+
+ <noscript class="js-disabled-warning">
+ <div class="alert alert-danger">
+ <i class="icon icon-warning-sign"></i> Javascript is disabled in your browser. Please <a href="https://www.enable-javascript.com" target="_blank">enable Javascript</a> to view PeerJ.
+ </div>
+ </noscript>
+
+
+ <div class="row publication-jsondata" data-publication-meta="{&quot;publicationId&quot;:&quot;4375&quot;,&quot;Article-section&quot;:&quot;NA&quot;,&quot;journal&quot;:&quot;PeerJ&quot;,&quot;published&quot;:&quot;2018-02-13 08:54:18&quot;,&quot;preprint&quot;:false,&quot;publicationSubjects&quot;:[&quot;Legal Issues&quot;,&quot;Science Policy&quot;,&quot;Data Science&quot;],&quot;publicationInstitutions&quot;:[&quot;Simon Fraser University&quot;,&quot;University of Washington&quot;,&quot;University of Ottawa&quot;],&quot;publicationTop20Institution&quot;:true,&quot;publicationInstitutionPlan&quot;:true}">
+ <!-- Left sidebar -->
+ <div class="span1 article-sidebar">
+ <div class="article-sidebar-left">
+ <div class="sidebar-box sidebar-box--journal">
+ <a href="/" class="sidebar-box--journal-mask"></a>
+ <img src="https://d2pdyyx74uypu5.cloudfront.net/images/article/logos/article-logo-peerj.png">
+ </div>
+
+ <div id="btn-view-tweets" class="sidebar-box sidebar-box--tweet">
+ <div class="text-center">View 618 tweets <i class="icon-twitter"></i></div>
+ </div>
+
+ <a href="#related-research" class="sidebar-box sidebar-box--related text-center">
+ Related research
+ <i class="icon-angle-down"></i>
+ </a>
+
+ <!-- mobile only -->
+ <div class="item-leftside-actions">
+ <div class="sidebar-box sidebar-box--action js-download-modal-trigger">Download</div>
+
+ <div id="notification-actions-mobile" class="sidebar-box sidebar-box--action" data-href="/following/publication/4522/">
+ <span class="follow-btn " id="item-left-follow-btn"
+ title="Receive article updates" data-toggle="tooltip" data-success-modal="#followModal"
+ data-href="/follow/publication/4522/0/">
+ <span class="button_text_follow">Follow</span class="follow-btn publication-label publication-label-general publication-label-middle" id="item-left-follow-btn"
+ ></span>
+</div>
+
+
+
+ <div class="sidebar-box sidebar-box--social visible-desktop">
+ <div class="sidebar-box--social-title">Share</div>
+ <div class="d-flex">
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </div>
+</div>
+
+<div class="btn-group sidebar-box sidebar-box--action">
+ <a href="#" class="btn-share dropdown-toggle" data-toggle="dropdown">Share</a>
+
+ <ul class="dropdown-menu">
+ <li>
+ <a href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+ </ul>
+</div>
+
+ </div>
+
+ </div>
+
+ <div class="peer-reviewed visible-phone">
+ <i class="icon-ok"></i> PEER-REVIEWED
+ </div>
+
+ </div>
+
+ <div id="annotations-sidebar" class="span5"></div>
+
+ <!-- Middle col -->
+ <div id="article-item-middle" class="span7"
+ data-ms-type-entity="articles" data-ms-type-id="research-article" data-ms-type-text="Research-article">
+
+ <div id="article-tweets-container">
+ <div class="row-fluid article-tweets-header">
+ <div class="span9">
+ <h2><em>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</em></h2>
+ </div>
+ <div class="span3">
+ <div class="btn btn-inverse pull-right" id="btn-view-article"><span class="icon-file"></span> View article</div>
+ </div>
+ </div>
+ <div class="tweet-items"> <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1297703289707016194/-sYklkZs_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=164969574" target="_blank"><strong></strong> <span class="twitter-handle">@LorenAndreaEP</span></a>
+ <span class="item-tweet-date">11 days ago</span>
+ </div>
+ <div>RT @AMAldanaS: También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradore…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1293635358064807937/YCE7J6e-_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15271321" target="_blank"><strong>Rachel Borchardt</strong> <span class="twitter-handle">@ButternutSquash</span></a>
+ <span class="item-tweet-date">12 days ago</span>
+ </div>
+ <div>@ces43 May I recommend Piwowar and Priem et al&#039;s article for that topic? https://t.co/Fnm0vtYtKS</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1210228942415814656/L6yRkSyu_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1117109826" target="_blank"><strong>Ana M. Aldana</strong> <span class="twitter-handle">@AMAldanaS</span></a>
+ <span class="item-tweet-date">40 days ago</span>
+ </div>
+ <div>También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradores de 2018 en donde se evidencia la ventaja de publicar en green open access: . https://t.co/1HAmYlfoBP</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/982225468286840837/BM5R0jJh_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=982223918223130624" target="_blank"><strong>Scicomm</strong> <span class="twitter-handle">@ScicommBot</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/856499301358477312/GLL-DiUg_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=850296415708471297" target="_blank"><strong>Open Pharma</strong> <span class="twitter-handle">@_OpenPharma</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">102 days ago</span>
+ </div>
+ <div>@Mietmensch @unpaywall Gotcha. It&#039;s tough to generalize the answer to that, as it depends a lot on the specific journal and field. We dove into the details more in this paper, though: https://t.co/HRus7k3P0B</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">103 days ago</span>
+ </div>
+ <div>@dwhly @unpaywall @hpiwowar historical stats are in here: https://t.co/HRus7k3P0B
+
+prediction for future is here: https://t.co/ex0vvThc9G</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/456347532637896704/We-tZ-rF_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=13616592" target="_blank"><strong>Eric Sieverts</strong> <span class="twitter-handle">@sieverts</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/633201529575632897/5rB4RNtd_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=163244377" target="_blank"><strong>Hector Keun</strong> <span class="twitter-handle">@hectorkeun</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @OxonAndrew: A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact o…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1233869298344611840/suKOWJtS_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1024381399447613443" target="_blank"><strong>Asynchrony</strong> <span class="twitter-handle">@temporalization</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @egonwillighagen: the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJ…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/447652981291614208/RtR2dZtC_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=536409536" target="_blank"><strong>Andrew Singer</strong> <span class="twitter-handle">@OxonAndrew</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles†â¦@thePeerJâ© https://t.co/yCu96hCzMK</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighâ“gen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJV72Uf https://t.co/DE9MPIKTdZ</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighâ“gen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>@egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for values.</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1220321309411942408/nhm-dSur_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1215236299344502791" target="_blank"><strong>Open Science Community Maastricht</strong> <span class="twitter-handle">@OSCMaastricht</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1263564961068077059/CKFX9dV2_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=371391064" target="_blank"><strong>Marie E McVeigh</strong> <span class="twitter-handle">@JopieNet</span></a>
+ <span class="item-tweet-date">121 days ago</span>
+ </div>
+ <div>@lisalibrarian @ashleydfarley @andy_nobes Usual def of &quot;bronze&quot; in @our_research is free to read, but does not have CC license.
+https://t.co/T34fQja0nN</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">146 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+
+<div class="tweet-pagination pagination">
+
+ <ul>
+
+ <li class="active"><a href="#">1</a></li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2" class="page">2</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=3" class="page">3</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=4" class="page">4</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=5" class="page">5</a>
+ </li>
+
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2">Next</a>
+ </li>
+ </ul>
+
+ <hr>
+</div></div>
+</div>
+ <div id="article-main-container">
+ <div class="article-section-breadcrumb">
+ <span class="icon-angle-left"></span>
+ <span><a href="/"><em>PeerJ</em></a></span>
+ </div>
+
+
+ <div class="hidden-print">
+
+ <div id="article-preexisting" class="well peerj-paper-well" >
+ <i class="icon-pushpin icon-large"></i> Note that a <a href="/preprints/3119/">Preprint of this article</a> also exists, first published August 2, 2017.
+ </div>
+ </div>
+
+ <!-- Main article -->
+ <article itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle"><header class="article-meta front"><h1 class="article-title" itemprop="name headline">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</h1>
+<div class="article-authors">
+<span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-1" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-1" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Heather</span> <span class="surname" itemprop="familyName">Piwowar</span></span></a><a class="corresp" href="mailto:heather@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-2" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-2" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jason</span> <span class="surname" itemprop="familyName">Priem</span></span></a><a class="corresp" href="mailto:jason@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." 
data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-3" data-jats-contrib-type="author" itemprop="author"><a href="author-3" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Vincent</span> <span class="surname" itemprop="familyName">Larivière</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-2" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-2">2</a>,<a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-4" data-jats-contrib-type="author" itemprop="author"><a href="author-4" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Juan Pablo</span> <span class="surname" itemprop="familyName">Alperin</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-4" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-4">4</a>,<a class="aff xref" href="#aff-5" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-5">5</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-5" data-jats-contrib-type="author" itemprop="author"><a href="author-5" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Lisa</span> <span class="surname" itemprop="familyName">Matthias</span></span></a><sup 
class="contrib-xref-group"><a class="aff xref" href="#aff-6" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-6">6</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-6" data-jats-contrib-type="author" itemprop="author"><a href="author-6" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Bree</span> <span class="surname" itemprop="familyName">Norlander</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-7" data-jats-contrib-type="author" itemprop="author"><a href="author-7" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Ashley</span> <span class="surname" itemprop="familyName">Farley</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-8" data-jats-contrib-type="author" itemprop="author"><a href="author-8" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jevin</span> <span class="surname" itemprop="familyName">West</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" 
itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-9" data-jats-contrib-type="author" itemprop="author"><a href="author-9" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Stefanie</span> <span class="surname" itemprop="familyName">Haustein</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a>,<a class="aff xref" href="#aff-9" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-9">9</a></sup></span>
+</div>
+<div id="article-information">
+<div class="article-notes">
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-1">
+<span class="article-label-container"><a class="article-label">1</a></span><span itemprop="address"><span class="institution">Impactstory</span>, <span class="city">Sanford</span>, <span class="state">NC</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-2">
+<span class="article-label-container"><a class="article-label">2</a></span><span itemprop="address"><span class="institution">École de bibliothéconomie et des sciences de l’information, Université de Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-3">
+<span class="article-label-container"><a class="article-label">3</a></span><span itemprop="address"><span class="institution">Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-4">
+<span class="article-label-container"><a class="article-label">4</a></span><span itemprop="address"><span class="institution">Canadian Institute for Studies in Publishing, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="state">BC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-5">
+<span class="article-label-container"><a class="article-label">5</a></span><span itemprop="address"><span class="institution">Public Knowledge Project</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-6">
+<span class="article-label-container"><a class="article-label">6</a></span><span itemprop="address"><span class="institution">Scholarly Communications Lab, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-7">
+<span class="article-label-container"><a class="article-label">7</a></span><span itemprop="address"><span class="institution">Information School, University of Washington</span>, <span class="city">Seattle</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-8">
+<span class="article-label-container"><a class="article-label">8</a></span><span itemprop="address"><span class="institution">FlourishOA</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-9">
+<span class="article-label-container"><a class="article-label">9</a></span><span itemprop="address"><span class="institution">School of Information Studies, University of Ottawa</span>, <span class="city">Ottawa</span>, <span class="state">ON</span>, <span class="country">Canada</span></span>
+</div>
+</div>
+<dl class="article-identifiers">
+<dt> DOI</dt>
+<dd>
+<a href="https://doi.org/10.7717/peerj.4375" itemprop="sameAs">10.7717/peerj.4375</a><meta itemprop="sameAs" content="info:doi/10.7717/peerj.4375">
+</dd>
+</dl>
+<dl class="article-dates">
+<dt>Published</dt>
+<dd><time itemprop="datePublished">2018-02-13</time></dd>
+<dt>Accepted</dt>
+<dd><time data-itemprop="dateAccepted">2018-01-25</time></dd>
+<dt>Received</dt>
+<dd><time itemprop="dateCreated">2017-08-09</time></dd>
+</dl>
+<dl class="article-editors">
+<dt>Academic Editor</dt>
+<dd itemprop="editor" itemscope="itemscope" itemtype="http://schema.org/Person"><a itemprop="url" href="editor-1" class="contrib" data-jats-contrib-type="editor"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Robert</span> <span class="surname" itemprop="familyName">McDonald</span></span></a></dd>
+</dl>
+<dl class="article-subjects">
+<dt>Subject Areas</dt>
+<dd>
+<a class="subject" itemprop="about" href="/subjects/?filter=Legal%20Issues">Legal Issues</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Science%20Policy">Science Policy</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Data%20Science">Data Science</a>
+</dd>
+<dt>Keywords</dt>
+<dd>
+<span class="kwd" itemprop="keywords">Open access</span>, <span class="kwd" itemprop="keywords">Open science</span>, <span class="kwd" itemprop="keywords">Scientometrics</span>, <span class="kwd" itemprop="keywords">Publishing</span>, <span class="kwd" itemprop="keywords">Libraries</span>, <span class="kwd" itemprop="keywords">Scholarly communication</span>, <span class="kwd" itemprop="keywords">Bibliometrics</span>, <span class="kwd" itemprop="keywords">Science policy</span>
+</dd>
+</dl>
+<dl class="article-license">
+<dt>Copyright</dt>
+<dd>© <span itemprop="copyrightYear">2018</span> <span itemprop="copyrightHolder">Piwowar et al.</span>
+</dd>
+<dt>Licence</dt>
+<dd>
+ <span class="license-p">This is an open access article distributed under the terms of the <a class="ext-link" href="http://creativecommons.org/licenses/by/4.0/" rel="license" data-jats-ext-link-type="uri">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, reproduction and adaptation in any medium and for any purpose provided that it is properly attributed. For attribution, the original author(s), title, publication source (PeerJ) and either DOI or URL of the article must be cited.</span>
+ </dd>
+</dl>
+<dl class="self-citation">
+<dt>Cite this article</dt>
+<dd>
+<span class="self-citation-authors">Piwowar H, Priem J, Larivière V, Alperin JP, Matthias L, Norlander B, Farley A, West J, Haustein S.</span> <span class="self-citation-year">2018</span>. <span class="self-citation-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</span>. <span itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="self-citation-journal" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ</span></span> <span class="self-citation-volume" itemprop="volumeNumber">6</span></span>:<span class="self-citation-elocation" itemprop="pageStart">e4375</span> <a href="https://doi.org/10.7717/peerj.4375" itemprop="url">https://doi.org/10.7717/peerj.4375</a>
+</dd>
+</dl>
+<div class="alert alert-success view-public-reviews">The authors have chosen to make <a href="/articles/4375/reviews/">the review history of this article</a> public.</div>
+</div>
+<div>
+<h2>Abstract</h2>
+<div class="abstract" itemprop="description">
+ <p>Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.</p>
+ </div>
+</div></header><main><div class="body" lang="en">
+ <section class="sec" id="intro">
+ <h2 class="heading">Introduction</h2>
+ <p id="p-1">The movement to provide open access (OA) to all research literature is now over fifteen years old. In the last few years, several developments suggest that after years of work, a sea change is imminent in OA. First, funding institutions are increasingly mandating OA publishing for grantees. In addition to the US National Institutes of Health, which mandated OA in 2008 (<a class="ext-link" href="https://publicaccess.nih.gov/index.htm" data-jats-ext-link-type="uri">https://publicaccess.nih.gov/index.htm</a>), the Bill and Melinda Gates Foundation (<a class="ext-link" href="http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy" data-jats-ext-link-type="uri">http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy</a>), the European Commission (<a class="ext-link" href="http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf" data-jats-ext-link-type="uri">http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf</a>), the US National Science Foundation (<a class="ext-link" href="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf" data-jats-ext-link-type="uri">https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf</a>), and the Wellcome Trust (<a class="ext-link" href="https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy" data-jats-ext-link-type="uri">https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy</a>), among others, have made OA diffusion mandatory for grantees. Second, several tools have sprung up to build value atop the growing OA corpus. These include discovery platforms like ScienceOpen and 1Science, and browser-based extensions like the Open Access Button, Canary Haz, and Unpaywall. 
Third, Sci-Hub (a website offering pirate access to full text articles) has built an enormous user base, provoking newly intense conversation around the ethics and efficiency of paywall publishing (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.11366.1" title="Looking into Pandora’s Box: the content of Sci-Hub and its usage [version 1; referees: 2 approved, 2 approved with reservations]" data-jats-ref-type="bibr" data-jats-rid="ref-26">Greshake, 2017</a>). Academic social networks like ResearchGate and Academia.edu now offer authors an increasingly popular but controversial solution to author self-archiving (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2016.08.002" title="Hybrid open access—a longitudinal study" data-jats-ref-type="bibr" data-jats-rid="ref-8">Björk, 2016a</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>). 
Finally, the increasing growth in the cost of toll-access subscriptions, particularly via so-called “Big Deals” from publishers, has begun to force libraries and other institutions to initiate large-scale subscription cancellations; recent examples include Caltech, the University of Maryland, University of Konstanz, Université de Montréal, and the national system of Peru (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>; <a class="xref xref-bibr" href="https://doi.org/10.1038%2Fnature.2016.21223" title="Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals" data-jats-ref-type="bibr" data-jats-rid="ref-41">Schiermeier &amp; Mega, 2017</a>; <a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/" title="When the wolf finally arrives: big deal cancelations in North American Libraries" data-jats-ref-type="bibr" data-jats-rid="ref-1">Anderson, 2017a</a>; <a class="xref xref-bibr" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/" title="Teurer als die Wissenschaft erlaubt" data-jats-ref-type="bibr" data-jats-rid="ref-47">Université Konstanz, 2014</a>). 
As the toll-access status quo becomes increasingly unaffordable, institutions are looking to OA as part of their “Plan B” to maintain access to essential literature (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>).</p>
+ <p id="p-2">Open access is thus provoking a new surge of investment, controversy, and relevance across a wide group of stakeholders. We may be approaching a moment of great importance in the development of OA, and indeed of the scholarly communication system. However, despite the recent flurry of development and conversation around OA, there is a need for large-scale, high-quality data on the growth and composition of the OA literature itself. In particular, there is a need for a data-driven “state of OA” overview that is (a) large-scale, (b) up-to-date, and (c) reproducible. This paper attempts to provide such an overview, using a new open web service called oaDOI that finds links to legally-available OA scholarly articles.<a class="xref xref-fn" href="#fn-1" data-jats-ref-type="fn" data-jats-rid="fn-1"><sup>1</sup></a> Building on data provided by the oaDOI service, we answer the following questions:</p>
+ <ol class="list" id="list-1" data-jats-list-type="order">
+ <li class="list-item">
+<p id="p-4">What percentage of the scholarly literature is OA, and how does this percentage vary according to publisher, discipline, and publication year?</p>
+ </li>
+ <li class="list-item">
+<p id="p-5">Are OA papers more highly-cited than their toll-access counterparts?</p>
+ </li>
+ </ol>
+ <p id="p-6">The next section provides a brief review of the background literature for this paper, followed by a description of the datasets and methods used, as well as details on the definition and accuracy of the oaDOI categorization. Results are then presented, in turn, for each research question, and are followed by a general discussion and conclusions.</p>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Literature Review</h2>
+ <p id="p-7">Fifteen years of OA research have produced a significant body of literature, a complete review of which falls outside the scope of this paper (for recent, in-depth reviews, see <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.8460.3" title="The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)" data-jats-ref-type="bibr" data-jats-rid="ref-46">Tennant et al. (2016)</a> and <a class="xref xref-bibr" href="https://doi.org/10.7554%2FeLife.16800" title="How open science helps researchers succeed" data-jats-ref-type="bibr" data-jats-rid="ref-36">McKiernan et al. (2016)</a>). Here we instead briefly review three major topics from the OA literature: defining OA and its subtypes, assessing the prevalence of OA, and examining the relative citation impact of OA.</p>
+ <p id="p-8">Despite the large literature on OA, the term itself remains “somewhat fluid” (Antelman, 2004), making an authoritative definition challenging. The most influential definition of OA comes from the 2002 Budapest Open Access Initiative (BOAI), and defines OA as making content both <i>free to read</i> and <i>free to reuse</i>, requiring the opportunity of OA users to “crawl (articles) for indexing, pass them as data to software, or use them for any other lawful purpose.” In practice, the BOAI definition is roughly equivalent to the popular “CC-BY” Creative Commons license (<a class="xref xref-bibr" href="https://creativecommons.org/licenses/by/4.0/" title="Attribution 4.0 International (CC BY 4.0)" data-jats-ref-type="bibr" data-jats-rid="ref-19">Creative Commons, 2018</a>). However, a number of other sources prefer a less strict definition, requiring only that OA “makes the research literature free to read online” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003" title="The nine flavours of open access scholarly publishing" data-jats-ref-type="bibr" data-jats-rid="ref-51">Willinsky, 2003</a>), or that it is “digital, online, [and] free of charge.” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009" title="Status of open access in the biomedical field in 2005" data-jats-ref-type="bibr" data-jats-rid="ref-34">Matsubayashi et al., 2009</a>). 
Others have suggested it is more valuable to think of OA as a spectrum (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2016.1182672" title="Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool" data-jats-ref-type="bibr" data-jats-rid="ref-17">Chen &amp; Olijhoek, 2016</a>).</p>
+ <p id="p-9">Researchers have identified a number of subtypes of OA; some of these have near-universal support, while others remain quite controversial. We will not attempt a comprehensive list of these, but instead note several that have particular relevance for the current study.</p>
+ <ul class="list" id="list-2" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-10">Libre OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): extends user’s rights to read and also to reuse literature for purposes like automated crawling, archiving, or other purposes. The Libre OA definition is quite similar to the BOAI definition of OA.</p>
+ </li>
+ <li class="list-item">
+<p id="p-11">Gratis OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): in contrast to Libre, Gratis extends <i>only</i> rights to read articles.</p>
+ </li>
+ <li class="list-item">
+<p id="p-12">Gold OA: articles are published in an “OA journal,” a journal in which all articles are open directly on the journal website. In practice, OA journals are most often defined by their inclusion in the Directory of Open Access Journals (DOAJ) (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>; <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al., 2012</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-13">Green OA: Green articles are published in a toll-access journal, but self-archived in an OA archive. These “OA archives” are either disciplinary repositories like ArXiv, or “institutional repositories” (IRs) operated by universities, and the archived articles may be either the published versions, or electronic preprints (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>). Most Green OA articles do not meet the BOAI definition of OA since they do not extend reuse rights (making them Gratis OA).</p>
+ </li>
+ <li class="list-item">
+<p id="p-14">Hybrid OA: articles are published in a subscription journal but are immediately free to read under an open license, in exchange for an article processing charge (APC) paid by authors (<a class="xref xref-bibr" href="https://doi.org/10.1241%2Fjohokanri.41.678" title="Free internet access to traditional journals" data-jats-ref-type="bibr" data-jats-rid="ref-50">Walker &amp; Soichi, 1998</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-15">Delayed OA: articles are published in a subscription journal, but are made free to read after an embargo period (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=Willinsky&amp;publication_year=2009" title="The access principle: the case for open access to research and scholarship" data-jats-ref-type="bibr" data-jats-rid="ref-52">Willinsky, 2009</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-16">Academic Social Networks (ASN): Articles are shared by authors using commercial online social networks like ResearchGate and Academia.edu. While some include these in definitions of OA (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>), others argue that content shared on ASNs is not OA at all. Unlike Green OA repositories, ASNs do not check for copyright compliance, and therefore as much as half their content is illegally posted and hosted (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-017-2291-4" title="Copyright compliance and infringement in ResearchGate full-text journal articles" data-jats-ref-type="bibr" data-jats-rid="ref-30">Jamali, 2017</a>). This raises concerns over the persistence of content, since, as was the case in October 2017, publishers can and do issue large-scale takedown notices to ASN ordering the removal of infringing content (<a class="xref xref-bibr" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement" title="Publishers take ResearchGate to court, alleging massive copyright infringement" data-jats-ref-type="bibr" data-jats-rid="ref-15">Chawla, 2017</a>). 
Others have raised questions about the sustainability and ethics of ASN services themselves (<a class="xref xref-bibr" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html" title="A social networking site is not an open access repository" data-jats-ref-type="bibr" data-jats-rid="ref-22">Fortney &amp; Gonder, 2015</a>). Due to these concerns, and inconsistent support from the literature, we exclude ASN-hosted content from our definition of OA.<a class="xref xref-fn" href="#fn-2" data-jats-ref-type="fn" data-jats-rid="fn-2"><sup>2</sup></a> </p>
+ </li>
+ <li class="list-item">
+<p id="p-18">“Black OA”: Articles shared on illegal pirate sites, primarily Sci-Hub and LibGen. Although (<a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1096" title="Gold, green, and black open access" data-jats-ref-type="bibr" data-jats-rid="ref-10">Björk, 2017</a>) labels these articles as a subtype of OA, the literature has nearly no support for including Sci-Hub articles in definitions of OA. Given this, we exclude Sci-Hub and LibGen content from our definition of OA.</p>
+ </li>
+ </ul>
+ <p id="p-19">Based on the consensus (and in some cases, lack of consensus) around these definitions and subtypes, we will use the following definition of OA in the remainder of this paper: <b>OA articles are free to read online, either on the publisher website or in an OA repository.</b></p>
+ <section class="sec">
+ <h3 class="heading">Prevalence of OA</h3>
+ <p id="p-20">Many studies have estimated what proportion of the literature is available OA, including <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0011273" title="Open access to the scientific journal literature: situation 2009" data-jats-ref-type="bibr" data-jats-rid="ref-12">Björk et al. (2010)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1186%2F1741-7015-10-124" title="Anatomy of open access publishing: a study of longitudinal development and internal structure" data-jats-ref-type="bibr" data-jats-rid="ref-31">Laakso &amp; Björk (2012)</a>, <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al. (2012)</a>, <a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al. (2013)</a>, <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> and <a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen (2013)</a>. We are not aware of any studies since 2014. 
The most recent two analyses estimate that more than 50% of papers are now freely available online, when one includes both OA and ASNs. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a>, the most comprehensive study to date, estimates that of papers published between 2011 and 2013, 12% of articles could be retrieved from the journal website, 6% from repositories, and 31% by other mechanisms (including ASNs). <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> also found that the availability of papers published between 1996 and 2011 increased by 4% between April 2013 and April 2014, noting that “backfilling” is a significant contributor to green OA. Their discipline-level analysis confirmed the findings of other studies, that the proportion of OA is relatively high in biomedical research and math, while notably low in engineering, chemistry, and the humanities.</p>
+ <p id="p-21">This <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> study is of particular interest because it used automated web scraping to find and identify OA content; most earlier efforts have relied on laborious manual checking of the DOAJ, publisher webpages, Google, and/or Google Scholar (though see <a class="xref xref-bibr" href="http://arxiv.org/abs/cs/0606079" title="Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact" data-jats-ref-type="bibr" data-jats-rid="ref-27">Hajjem, Harnad &amp; Gingras (2006)</a> for a notable early exception). By using automated methods, Archambault et al. were able to sample hundreds of thousands of articles, greatly improving statistical power and supporting more nuanced inferences. Moreover, by creating a system that indexes OA content, they address a major concern in the world of OA research; as <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a> observes: “A major challenge for research...has been the lack of comprehensive indexing for both OA journals and their articles.†The automated system of <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. 
(2014)</a> is very accurate—it only misclassifies a paper as OA 1% of the time, and finds about 75% of all OA papers that exist online, as per <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. However, the algorithm is not able to distinguish Gold from Hybrid OA. More problematically for researchers, the database used in the study is not open online for use in follow-up research. Instead, the data has since been used to build the commercial subscription-access database 1science (<a class="ext-link" href="http://www.1science.com/oanumbr.html" data-jats-ext-link-type="uri">http://www.1science.com/oanumbr.html</a>).</p>
+ </section>
+ <section class="sec">
+ <h3 class="heading">The open access citation advantage</h3>
+ <p id="p-22">Several dozen studies have compared the citation counts of OA articles and toll-access articles. Most of these have reported higher citation counts for OA, suggesting a so-called “open access citation advantage†(OACA); several annotated bibliographies have been created to track this literature (<a class="xref xref-bibr" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/" title="The open access citation advantage: list of studies until 2015" data-jats-ref-type="bibr" data-jats-rid="ref-43">SPARC Europe, 2015</a>; <a class="xref xref-bibr" href="https://doi.org/10.5062%2FF4Q81B0W" title="Open access citation advantage: an annotated bibliography" data-jats-ref-type="bibr" data-jats-rid="ref-49">Wagner, 2010</a>; <a class="xref xref-bibr" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D" title="The open access citation advantage" data-jats-ref-type="bibr" data-jats-rid="ref-45">Tennant, 2017</a>). The OACA is not universally supported. Many studies supporting the OACA have been criticised on methodological grounds (<a class="xref xref-bibr" href="https://doi.org/10.3163%2F1536-5050.99.3.008" title="The impact of free access to the scientific literature: a review of recent research" data-jats-ref-type="bibr" data-jats-rid="ref-21">Davis &amp; Walters, 2011</a>), and an investigation using the randomized-control trial method failed to find evidence of an OACA (<a class="xref xref-bibr" href="https://doi.org/10.1096%2Ffj.11-183988" title="Open access, readership, citations: a randomized controlled trial of scientific journal publishing" data-jats-ref-type="bibr" data-jats-rid="ref-20">Davis, 2011</a>). However, recent investigations using robust methods have continued to observe an OACA. 
For instance, <a class="xref xref-bibr" href="https://doi.org/10.1111%2Fecin.12064" title="Identifying the effect of open access on citations using a panel of science journals" data-jats-ref-type="bibr" data-jats-rid="ref-35">McCabe &amp; Snyder (2014)</a> used a complex statistical model to remove confounding effects of author selection (authors may selectively publish their higher-impact work as OA), reporting a small but meaningful 8% OACA. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> describe a 40% OACA in a massive sample of over one million articles using field-normalized citation rates. <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0159614" title="The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)" data-jats-ref-type="bibr" data-jats-rid="ref-38">Ottaviani (2016)</a> used a natural experiment as articles (not selected by authors) emerged from embargoes to become OA, and reports a 19% OACA excluding the author self-selection bias for older articles outside their prime citation years.</p>
+ </section>
+ </section>
+ <section class="sec" id="methods">
+ <h2 class="heading">Methods</h2>
+ <section class="sec">
+ <h3 class="heading">OA determination</h3>
+ <section class="sec">
+ <h4 class="heading">Classifications</h4>
+ <p id="p-23">We classify publications into two categories, OA and Closed. As described above, we define OA as <i>free to read online, either on the publisher website or in an OA repository</i>; all articles not meeting this definition were defined as Closed. We further divide the OA literature into one of four exclusive subcategories, resulting in a five-category classification system for articles:</p>
+ <ul class="list" id="list-3" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-24"><b>Gold</b>: Published in an open-access journal that is indexed by the DOAJ.</p>
+ </li>
+ <li class="list-item">
+<p id="p-25"><b>Green</b>: Toll-access on the publisher page, but there is a free copy in an OA repository.</p>
+ </li>
+ <li class="list-item">
+<p id="p-26"><b>Hybrid</b>: Free under an open license in a toll-access journal.</p>
+ </li>
+ <li class="list-item">
+<p id="p-27"><b>Bronze</b>: Free to read on the publisher page, but without a clearly identifiable license.</p>
+ </li>
+ <li class="list-item">
+<p id="p-28"><b>Closed</b>: All other articles, including those shared only on an ASN or in Sci-Hub.</p>
+ </li>
+ </ul>
+ <p id="p-29">These categories are largely consistent with their use throughout the OA literature, although a few clarifications are useful. First, we (like many other OA studies) do not include ASN-hosted content as OA. Second, categories are exclusive, and publisher-hosted content takes precedence over self-archived content. This means that if an article is posted in both a Gold journal and an OA repository, we would classify it as Gold, not Green. Put another way, publisher-hosted content can “shadow†archived articles that would otherwise be Green. This definition of Green (“available in a repository but <i>not</i> available from the publisherâ€) is often used in the OA literature (including by Steven Harnad, the coiner of the Green and Gold terms <a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>), but this usage is not unanimous. Some studies allow a given article to be <i>both</i> Gold and Green; compared to these, our classification system does undercount Green. Hybrid articles share properties with Gold articles (both are free to read and are licensed for re-use), but differ in the venue of publication (i.e., Hybrid articles are published in journals not considered open access by the DOAJ) and in that Hybrid articles are not necessarily immediately available (i.e., they may only be freely available after an embargo). We also add a novel subcategory, Bronze. Bronze shares attributes of Gold and Hybrid; like both, Bronze OA articles are publisher-hosted. Unlike Gold OA, Bronze articles are not published in journals considered open access in the DOAJ. Unlike Hybrid, Bronze articles carry no license information. 
Although this lack of identifiable license may not be intentional, without an identifiable license, the articles are free to read but do not allow extended reuse rights beyond reading. It is also not clear if Bronze articles are temporarily or permanently available to read for free.</p>
+ <p id="p-30">Finally, we should add that, although our categories of choice reflect the OA literature, they do not necessarily reflect the more complex reality of scholarly publishing today. Organizations like SciELO and Redalyc in Latin America have been acting simultaneously as publishers and repositories and many of the articles found on their site do not fall neatly into the above categories (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010" title="The SciELO open access: a gold way from the south" data-jats-ref-type="bibr" data-jats-rid="ref-39">Packer, 2010</a>).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">The oaDOI system</h4>
+ <p id="p-31">We assigned the categories above by calling the oaDOI service with a DOI for each item. The oaDOI returns a link to a legally-available OA version of the article, when one is available (<a class="ext-link" href="https://oadoi.org/" data-jats-ext-link-type="uri">https://oadoi.org/</a>). It contains records for all 88 million Crossref DOIs.<a class="xref xref-fn" href="#fn-3" data-jats-ref-type="fn" data-jats-rid="fn-3"><sup>3</sup></a> The oaDOI service crawls, aggregates, normalizes, and verifies data from many sources including PMC (<a class="ext-link" href="https://www.ncbi.nlm.nih.gov/pmc/" data-jats-ext-link-type="uri">https://www.ncbi.nlm.nih.gov/pmc/</a>), BASE (<a class="ext-link" href="https://www.base-search.net/about/en/" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/</a>), DOAJ (<a class="ext-link" href="https://doaj.org/" data-jats-ext-link-type="uri">https://doaj.org/</a>), and thousands of institutional repositories and publishers. The oaDOI system offers a fast, free API with no rate-limits, allowing it to support a variety of other services and tools. At the time of writing, oaDOI processes approximately 500,000 requests daily–roughly twice the daily uses of Sci-Hub<a class="xref xref-fn" href="#fn-4" data-jats-ref-type="fn" data-jats-rid="fn-4"><sup>4</sup></a> (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1" title="Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)" data-jats-ref-type="bibr" data-jats-rid="ref-29">Himmelstein et al., 2017</a>). 
The majority of this volume comes from around 700 academic libraries, who use oaDOI to help readers find articles where the library has no subscription access, addressing the discoverability problem (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen, 2013</a>). The oaDOI service also powers the Unpaywall browser extension, which helps readers to find legal OA copies of paywalled articles as they browse; Unpaywall currently has over 80,000 active users. The oaDOI codebase is open source, and the service is free and open via an open API.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Accuracy of oaDOI</h4>
+ <p id="p-34">To assess the accuracy of our automated OA determination, a random subsample of 500 articles was chosen from our main “Crossref-DOI” sample, described below. We manually searched the internet for each article in our subsample to determine if the paper was freely available on the publisher’s website, or on another website, such as an institutional repository, an academic social networking site, or on a personal webpage. DOIs were resolved by appending the DOI to “<a class="ext-link" href="https://doi.org/" data-jats-ext-link-type="uri">https://doi.org/</a>”. If the full text was available through that link, articles were marked as being freely available from the publisher’s site. If articles required a subscription, the title of the article was entered into Google Scholar (GS) and into Google to find alternative versions (i.e., preprints or archived copies). If the fulltext was found on any publisher page or OA repository, these were marked as being freely available from an archive. If the only available open copy was hosted on an academic social network (like Academia.edu or ResearchGate), this was noted but for the sake of the study these were <i>not</i> counted as any category of OA, and were instead added to the “Closed” category.</p>
+ <p id="p-35">The performance of oaDOI is summarized below, compared to these manual accuracy checks. The complete dataset behind this summary is available in supplementary information. Using this data we calculated the recall and precision of the system. “Recall” asks the question, “when an article is open, how often does oaDOI correctly identify it as open?” The recall of the service is 77.0%, meaning that 77% of the truly open articles are correctly identified as open by oaDOI. “Precision” asks the question, “When oaDOI says an article is open, how often is it correct?” The precision of the system is 96.6%, meaning that 96.6% of the time that oaDOI reports an article is open, it really is open.</p>
+ <p id="p-36">These results can be roughly compared to the recall of 86.4% and precision of 99.1% reported by <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> for their automated system. Their accuracy estimate was also calculated based on a sample of 500 data points, giving each estimate a margin of error of ±4.5 percentage points. The Archambault study used a narrower date window for their sample (starting in 1996, versus our Crossref-DOI sample which was not time restricted), resulting in a more homogeneous task, which may partially explain their somewhat better performance.</p>
+ <p id="p-37">The oaDOI service is optimized for high precision, rather than high recall. The very high precision of oaDOI means that any estimates derived from the database can be considered a <i>conservative</i> estimate of the actual percentage of open access in the literature. That is, we can safely assume that when oaDOI reports a certain percentage of open access, the real percentage is <i>at least</i> that high—and almost certainly higher given that recall was less than perfect. Put another way, oaDOI delivers very few false positives (where it mistakenly calls an article open), but a relatively high number of false negatives (where it mistakenly calls an article closed) (<a class="xref xref-table" href="#table-1" data-jats-ref-type="table" data-jats-rid="table-1">Table 1</a>). Future improvements to the system are planned that will improve recall while keeping precision high.</p>
+ <figure class="table-wrap" id="table-1"><div class="caption">
+<span class="caption-label">Table 1: </span>
+ <div class="title">Accuracy of the prototype version of the oaDOI service used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th></th>
+ <th>oaDOI reports Open</th>
+ <th>oaDOI reports Closed</th>
+ <th>Manual count Total (ground truth)</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Open</td>
+ <td>144</td>
+ <td>43</td>
+ <td>187</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>5</td>
+ <td>308</td>
+ <td>313</td>
+ </tr>
+ <tr>
+ <td>Total</td>
+ <td>149</td>
+ <td>351</td>
+ <td style="text-align:left;">500</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-1</a>
+</div>
+ </figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">Study samples</h3>
+ <p id="p-38">Three samples of DOI-assigned scholarly resources are summarized in <a class="xref xref-table" href="#table-2" data-jats-ref-type="table" data-jats-rid="table-2">Table 2</a> and described further below.</p>
+ <section class="sec">
+ <h4 class="heading">Crossref sample</h4>
+ <p id="p-39">The first sample, “Crossref-DOIs,” is a random sample of 100,000 journal articles with Crossref DOIs, across all publication years. There are approximately 88 million Crossref DOIs in total as of May 2017. In order to exclude books, datasets, and other non-article content, we sampled only items whose “type” was listed as “journal-article” in the Crossref API metadata; there are 66 million of these. To verify the accuracy of Crossref metadata, we manually checked 150 items assigned to type “journal-article,” and determined that 93% were indeed journal articles; the remaining 7% were mostly journal front-matter such as tables of content or instructions to authors.</p>
+ <figure class="table-wrap" id="table-2"><div class="caption">
+<span class="caption-label">Table 2: </span>
+ <div class="title">Summary of samples used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover table-text" data-jats-content-type="text">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Sample name</th>
+ <th>Sample size</th>
+ <th>Population sampled</th>
+ <th>Purpose</th>
+ <th>Population size</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Crossref-DOIs</td>
+ <td>100,000</td>
+ <td>All journal articles with Crossref DOIs, all years.</td>
+ <td>Estimate percentage of the literature that is OA.</td>
+ <td>66,560,153</td>
+ </tr>
+ <tr>
+ <td>WoS-DOIs</td>
+ <td>100,000</td>
+ <td>All citable WoS articles with DOIs, 2009–2015.</td>
+ <td>Estimate citation impact of recent OA papers, and also OA prevalence by discipline.</td>
+ <td>8,083,613</td>
+ </tr>
+ <tr>
+ <td>Unpaywall-DOIs</td>
+ <td>100,000</td>
+ <td>All articles accessed by Unpaywall users over a 1-week period in 2017.</td>
+ <td>Estimate percentage of OA experienced by users of the Unpaywall extension.</td>
+ <td>213,323</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-2</a>
+</div>
+ </figure>
+ <p id="p-40">The purpose of this sample is to roughly proxy the scholarly literature as a whole. As such, it has strengths and weaknesses. One weakness is that although Crossref includes information on citation counts and discipline categorization, we found these to be quite incomplete, and therefore not useful for the present study. Another is that researchers in the scientometrics and OA fields have largely relied on other indexes, particularly Scopus and Web of Science (WoS), to represent the literature as a whole; this makes our results more difficult to compare to previous work. Finally, DOIs are known to be less frequently assigned by publishers in certain disciplines (like humanities; <a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2015.11.008" title="Availability of digital object identifiers (DOIs) in web of science and scopus" data-jats-ref-type="bibr" data-jats-rid="ref-25">Gorraiz et al., 2016</a>), in certain geographic regions (particularly the developing world), and among older articles (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-016-2225-6" title="Availability of digital object identifiers in publications archived by PubMed" data-jats-ref-type="bibr" data-jats-rid="ref-14">Boudry &amp; Chartron, 2017</a>); consequently, these segments will be underrepresented in our sample. This said, Scopus and WoS are also known to underrepresent important segments of the literature (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-015-1765-5" title="The journal coverage of Web of Science and Scopus: a comparative analysis" data-jats-ref-type="bibr" data-jats-rid="ref-37">Mongeon &amp; Paul-Hus, 2016</a>), and so this failing is not limited to Crossref. Moreover, the Crossref sample has important advantages of its own over other indexes. 
While no sample of the scholarly literature will be complete in every regard, the Crossref index is more expansive than other sources: in July 2017 there were 67 million journal articles indexed in Crossref compared to 30 million in Scopus (<a class="ext-link" href="https://www.elsevier.com/solutions/scopus/content" data-jats-ext-link-type="uri">https://www.elsevier.com/solutions/scopus/content</a>). Also, Crossref has the advantage of being entirely free and open to use, while Scopus and WoS are subscription-access databases; this allows the study data to also be free and open, promoting replication and reuse of our results in further research. However, we did turn to the subscription-access WoS in order to answer questions about the discipline and citation counts of OA articles, since Crossref data is lacking in these areas.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">WoS sample</h4>
+ <p id="p-41">The second sample, “WoS-DOIs”, is a random sample of 100,000 journal articles with DOIs that are indexed by Web of Science. The sample was drawn from a local version of the WoS database at the Observatoire des sciences et des technologies (OST) at the Université du Québec à Montréal. Only articles that WoS defines as “citable items” are included in the sample; this excludes non-peer reviewed content such as editorial material and news items. This sample is restricted to articles published between 2009 and 2015, due to DOI availability constraints. The sample of 100,000 articles is randomly drawn from a population of 8 million articles and reviews with a DOI in WoS published between 2009 and 2015 as of May 2017.</p>
+ <p id="p-42">Because the WoS sample is restricted to certain publication years, due to availability of DOIs in the WoS database, this sample is unsuitable for estimating the proportion of the total literature that is OA. However, it is more useful than the Crossref sample in some ways: the WoS sample included accurate discipline information for each article (described below), and also citation counts. Therefore we use the WoS sample to assess OA prevalence by discipline and also the citation impact of recent OA papers. We do not encourage comparisons between the OA percentages in the WoS sample and the Crossref sample, because of large differences in the sampling frames.</p>
+ <p id="p-43">Documents in the WoS-DOIs sample were classified using the National Science Foundation (NSF) journal classification system. This system assigns every journal exactly one “discipline” (a high-level categorization) and exactly one “specialty” (a finer-grained categorization). Because this is a journal-level classification, all articles from a given journal are assigned the same discipline and specialty as the journal. A downside of this approach is that the system classifies multidisciplinary journals (e.g., Nature, PNAS, PLOS ONE) as “biomedical research”, despite their publishing many articles from other fields.<a class="xref xref-fn" href="#fn-5" data-jats-ref-type="fn" data-jats-rid="fn-5"><sup>5</sup></a> In these cases, we used a ground-up, article-by-article classification approach. Each article published in a list of multidisciplinary journals was assigned to the NSF specialty which appeared most frequently in its own reference list. In other words, papers published in multidisciplinary journals were classified at the article level (instead of at the journal level) to the subject area which they cite most frequently.<a class="xref xref-fn" href="#fn-6" data-jats-ref-type="fn" data-jats-rid="fn-6"><sup>6</sup></a> </p>
+ <p id="p-46">We assess the relative impact of open and closed articles, using citations as an indicator of their scholarly impact. There are several properties of articles, however, that can confound this kind of comparison. Chief among these are the article’s discipline (some fields are much more cited than others) and its age (older articles have had more time to gather citations). In order to address this, we computed a normalized expected number of citations for each article, based on its age and its NSF specialty, by comparing it to the average citations for similar articles.<a class="xref xref-fn" href="#fn-7" data-jats-ref-type="fn" data-jats-rid="fn-7"><sup>7</sup></a> </p>
+ <p id="p-48">Using this approach, each article receives an average relative citation (ARC). An ARC of 1.0 indicates that a document was cited according to expectations based on documents published in the same year and NSF specialty, while an ARC above or below 1.0 indicates that the citation impact was above or below world average, respectively. Using these field-normalized citation rates, citation impact can be compared across scientific disciplines as well as across years. We can also compute mean ARCs for groups of articles, like “all open articles” or “all closed articles”, allowing us to compare normalized impact between these two groups. Analyzing results on the level of NSF disciplines, data is not shown for the Humanities (<i>n</i> = 1,091) and Arts (<i>n</i> = 164), because they are underrepresented both in the Web of Science and in terms of DOI coverage.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Unpaywall sample</h4>
+ <p id="p-49">The third sample, “Unpaywall-DOIs”, is a random sample of 100,000 articles accessed by users of the free, open-source Unpaywall browser extension, gathered over a one-week time window. We collected IP addresses and DOI requests made to the oaDOI service through the Unpaywall browser extension during the week of June 5–June 11, 2017. In that time period there were 374,703 total accesses, 213,323 unique DOIs, and 42,894 unique IP addresses gathered in total, from which 100,000 unique DOIs were randomly sampled.</p>
+ <p id="p-50">This sample was used to assess the prevalence of OA experienced by users of the Unpaywall extension (since Unpaywall uses oaDOI data to find OA). It is a convenience sample of what articles people are interested in reading, and thereby lets us roughly estimate the percent of this literature that is OA. The sample has serious limitations, however: we don’t know the demographics of Unpaywall users, and we are aware of a bias towards users from the US (as determined by the IP addresses). As such, we cannot accurately generalize the results by education level, discipline, or purpose in reading the scholarly literature.</p>
+ </section>
+ </section>
+ </section>
+ <section class="sec" id="results">
+ <h2 class="heading">Results</h2>
+ <section class="sec">
+ <h3 class="heading">RQ1. What percent of the literature is open access?</h3>
+ <section class="sec">
+ <h4 class="heading">How much of the literature is OA?</h4>
+ <p id="p-51">We found 27.9% (95% CI [27.6–28.2]) of all DOI-assigned journal articles are OA, using the Crossref-DOI sample. Based on this, we estimate there are 18.6 million OA articles with Crossref DOIs (95% CI [18.4–18.8]). This is the total population of OA articles that can be identified and accessed by oaDOI. Given our finding (described in Methods above) that the oaDOI service finds 77% of OA compared to manual searches, we can further estimate that an additional 3.5 million articles are OA but not detectable by this version of oaDOI.</p>
+ <p id="p-52">People reading the literature using the Unpaywall browser extension encounter a significantly higher proportion of OA: we found that 47.0% (95% CI [46.7–47.3]) of the Unpaywall-accessed sample is open access. The main reason for this is article age: since this sample is based on the behavior of actual readers, it is disproportionately comprised of recent articles. In fact, half the accessed articles were published in the last 2 years. Recent articles are much more likely to be OA than their older counterparts (see Results ‘How does Open Access vary by year of publication?’ below).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">What types of Open Access are most common?</h4>
+ <p id="p-53">The proportion of OA by subtype is relatively similar across the samples, as shown in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a> and <a class="xref xref-table" href="#table-3" data-jats-ref-type="table" data-jats-rid="table-3">Table 3</a>. Green OA represents a relatively small percentage of OA articles in all three samples. This is partly because self-archived articles are only counted as Green where there is no publisher-hosted option available; that is, Green OA is sometimes “shadowed” by Gold, Bronze, or Hybrid articles. Bronze is the most common OA subtype in all the samples, which is particularly interesting given that few studies have highlighted its role. We manually inspected a small sample of Bronze articles in order to understand this subcategory more; we found that while many Bronze articles were Delayed OA from toll-access publishers, nearly half were hosted on journals that published 100% of content as free-to-read but were <i>not</i> listed on the DOAJ and did not formally license content (using CC-BY or any other license). Such journals might be better described as “Dark Gold” or “Hidden Gold” than Bronze. A more complete examination of Bronze falls outside the scope of this study, and therefore further investigation will be undertaken in future work.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-1"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 1: Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-small.jpg 355w" data-image-id="fig-1" alt="Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="230"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 1: </span>Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" class="btn btn-mini" download="peerj-4375-fig-1.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-1</a>
+</div>
+</div></figcaption></figure>
+ <figure class="table-wrap" id="table-3"><div class="caption">
+<span class="caption-label">Table 3: </span>
+ <div class="title">Percent of the literature that is OA, by type, in three samples of 100,000 journal articles, with 95% confidence intervals.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Access type</th>
+ <th style="text-align:center;" colspan="2">Crossref-DOI All journal articles with Crossref DOIs, all years. (“Articles with DOIs” in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a>)</th>
+ <th style="text-align:center;" colspan="2">WoS-DOIs All citable WoS articles with DOIs, 2009–2015</th>
+ <th style="text-align:center;" colspan="2">Unpaywall-DOIs All articles accessed by Unpaywall users over a 1-week period in 2017</th>
+ </tr>
+ <tr>
+ <th></th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>OA (all types)</td>
+ <td>27.9%</td>
+ <td>27.6–28.2</td>
+ <td>36.1%</td>
+ <td>36.0–36.2</td>
+ <td>47.0%</td>
+ <td>46.7–47.3</td>
+ </tr>
+ <tr>
+ <td>Bronze OA</td>
+ <td>16.2%</td>
+ <td>16.0–16.5</td>
+ <td>12.9%</td>
+ <td>12.6–13.2</td>
+ <td>15.3%</td>
+ <td>15.0–15.6</td>
+ </tr>
+ <tr>
+ <td>Hybrid OA</td>
+ <td>3.6%</td>
+ <td>3.3–3.9</td>
+ <td>4.3%</td>
+ <td>4.0–4.6</td>
+ <td>8.3%</td>
+ <td>8.0–8.6</td>
+ </tr>
+ <tr>
+ <td>Gold OA</td>
+ <td>3.2%</td>
+ <td>2.9–3.5</td>
+ <td>7.4%</td>
+ <td>7.1–7.7</td>
+ <td>14.3%</td>
+ <td>14.0–14.6</td>
+ </tr>
+ <tr>
+ <td>Green OA</td>
+ <td>4.8%</td>
+ <td>4.5–5.1</td>
+ <td>11.5%</td>
+ <td>11.2–11.8</td>
+ <td>9.1%</td>
+ <td>8.8–9.4</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>72.0%</td>
+ <td>71.8–72.4</td>
+ <td>63.9%</td>
+ <td>63.8–64.0</td>
+ <td>53.0%</td>
+ <td>52.7–53.3</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-3</a>
+</div>
+ </figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by year of publication?</h4>
+ <p id="p-54"><a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Figure 2</a> presents the number (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2A</a>) and proportion (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2B</a>) of papers by access category and publication date. Articles published in the last 20 years are increasingly OA, and this trend shows no sign of slowing. More recent articles are more likely to be OA, with the most recent year examined also containing the most OA: 44.7% of 2015 articles are OA (95% CI [43.3–46.2%]), including 17.6% Bronze (95% CI [16.2–19.1]), 9.4% Hybrid (95% CI [8.0–10.9]), 11.3% Gold (95% CI [9.9–12.8]), and 6.3% Green (95% CI [4.9–7.8]). Well over one million OA papers were published in 2015. This growth trend has largely been driven by dramatic growth in Gold and Hybrid OA since the year 2000. However, more than 20% of papers published before the digital age are also freely available. The majority of these older OA papers are Bronze, and based on their age they are probably more precisely Delayed OA, although additional investigation will be required to confirm this. Bronze OA remains remarkably constant as a proportion of the literature for all publication years examined.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-2"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 2: Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-small.jpg 355w" data-image-id="fig-2" alt="Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="216"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 2: </span>Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" class="btn btn-mini" download="peerj-4375-fig-2.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-2</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-55">The number and proportion of Green papers must be interpreted with particular caution, due to several factors. First, unlike publisher-hosted OA (Gold, Bronze, and Hybrid), the date when the Green article <i>became open</i> is generally different from the date the article was <i>first published</i>. Authors often self-archive articles years after (or before, in the case of preprints) their original publication, leading to so-called “backfilling” of Green stocks (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>). Consequently, the graph cannot show the growth of Green OA over time; this would require longitudinal analysis over several years, and so is outside the scope of this analysis. Instead it shows the number and proportion of Green OA by publication year of the article. Second, many articles cannot be legally self-archived until a certain number of months after publication; this embargoing likely influences the apparent plateau in Green shown in <a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2</a>. Finally, as noted earlier, many self-archived articles would otherwise be Green except for being “shadowed” by a Gold, Bronze, or Hybrid of the same article elsewhere. For more detail on the growth of shadowed Green OA, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Figs. SA2</a> and <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">SA3</a>.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by publisher?</h4>
+ <p id="p-56">We analyzed a subset of the Crossref-DOIs sample by publisher (as listed on the Crossref metadata record) to understand how the extent and types of OA are common across publishers for recent publications (between 2009 and 2015). As we can see in <a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3A</a>, the largest publishers by volume publish the most OA articles by volume, led by Elsevier. As a proportion of all articles published (<a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3B</a>), however, PLOS and Hindawi distinguish themselves as being the only publishers in the top 20 with 100% OA. More than half of the papers published by Oxford University Press, Nature Publishing Group, IOP Publishing, and the American Physical Society (APS) are freely available online. In the case of APS this is largely driven by content available through repositories such as arXiv (for more details on repositories, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>).</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-3"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 3: Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-small.jpg 355w" data-image-id="fig-3" alt="Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="282"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 3: </span>Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" class="btn btn-mini" download="peerj-4375-fig-3.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-3</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary across disciplines?</h4>
+ <p id="p-57">We used the WoS-DOIs sample to examine OA prevalence differences by discipline, because of the easy availability of discipline metadata in the WoS index. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> displays our results. More than half of the publications are freely available in biomedical research and mathematics, while in chemistry and engineering &amp; technology less than 20% of the papers are freely available. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> also highlights the popularity of Green OA in disciplines like physics and mathematics, where more than one fifth of papers are available only through online repositories (mainly arXiv). Hybrid articles are particularly prevalent in mathematics (9.4%), biomedical research (8.1%) and clinical medicine (6.3%), while authors in biomedical research (15.3%), health (11.7%), mathematics (11.2%) and clinical medicine (10.3%) often publish in Gold journals.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-4"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 4: Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-small.jpg 355w" data-image-id="fig-4" alt="Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="241"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 4: </span>Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" class="btn btn-mini" download="peerj-4375-fig-4.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-4" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-4</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-58">Large variations can also be observed on the more detailed level of NSF specialties (<a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA5</a>). At more than 80% of OA articles, astronomy &amp; astrophysics (87%), fertility (86%), tropical medicine (84%), and embryology (83%) were the specialties where access to literature was the most open. At the other end of the spectrum are pharmacy (7%), inorganic &amp; nuclear chemistry (7%), and chemical engineering (9%), where publications were hidden behind a paywall for more than 90% of papers. More detail on these and other NSF specialties can be seen in <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>.</p>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">RQ2. What is the scholarly impact of open access?</h3>
+ <p id="p-59">Comparing the average relative citation impact of different access categories, the OACA is corroborated: Papers hidden behind a paywall were cited 10% below world average (ARC = 0.90), while those that are freely available obtain, on average, 18% more citations than what is expected (ARC = 1.18). However, citation impact differs between the different manners in which papers are made available for free: those that are only available as Green OA (ARC = 1.33) and Hybrid OA papers (ARC = 1.31) are cited the most with an impact of more than 30% above expectations, those available as Bronze are cited 22% above world average, while papers published as Gold OA obtain an ARC of 0.83. This constitutes an average relative citation impact of 17% below world average and 9% below that of articles hidden behind a paywall. <a class="xref xref-fig" href="#fig-5" data-jats-ref-type="fig" data-jats-rid="fig-5">Figure 5</a> below describes these findings.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-5"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 5: Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w" data-image-id="fig-5" alt="Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="388"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 5: </span>Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" class="btn btn-mini" download="peerj-4375-fig-5.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-5" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-5</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-60">These trends vary over time, however, as shown in <a class="xref xref-fig" href="#fig-6" data-jats-ref-type="fig" data-jats-rid="fig-6">Fig. 6</a>. While the ARC of closed access papers remains below world average throughout the period studied, it increased from .86 in 2009 to .93 over in 2014 and 2015. Meanwhile, when looking across all open types, the mean citation rate is consistently above the world average, fluctuating between 1.15 and 1.22. This fluctuation is guided by differences between the access types, with the impact of Hybrid OA papers increasing over the time period. While Green OA papers’ mean citation rate remain relatively stable, the highest impact, for 2015, is obtained by Bronze and Hybrid. The only form of open for which mean impact has decreased steadily over time is Gold. The results for more recent years are only based on a short citation window, however, and results might change over the next years as citations accumulate.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-6"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 6: Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-small.jpg 355w" data-image-id="fig-6" alt="Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="465"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 6: </span>Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" class="btn btn-mini" download="peerj-4375-fig-6.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-6" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-6</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Discussion and Conclusion</h2>
+ <p id="p-61">Access to scholarly literature is at the heart of current debates in the research community. Research funders are increasingly mandating OA dissemination to their grantees while, at the same time, the growth in toll-access subscriptions costs have prompted more and more university libraries to cancel subscriptions. In this context, several tools have been developed to provide access–both legally and illegally–to scholarly literature. Using data from one of these tools (oaDOI), this paper addresses two broad research questions: what percent of the literature is OA and how does it vary by type of OA, and what is the mean scholarly impact of papers diffused through this form. Three large samples were used, to assess different aspects of OA patterns: (1) 100,000 articles that have a Crossref DOIs, which allows us to assess the relative proportion of OA across all existing literature; (2) 100,000 WoS-indexed journals articles that have a DOI, which allows us to assess the scholarly impact of OA and non OA papers; (3) 100,000 articles accessed by users through the Unpaywall browser extension, which lets us assess the proportion of OA papers found by users of this free tool.</p>
+ <p id="p-62">We found that 28% of all journal articles are freely available online (Crossref-DOI sample). Encouragingly for proponents of OA, this proportion has been growing steadily over the last 20 years, driven particularly by growth in Gold and Hybrid. Articles from 2015, the most recent year examined, had the highest proportion OA (45%), as well as the largest absolute number of OA articles published in a single year. This disproportionate level of OA in recent years, combined with readers’ preference for more recent articles, leads to a felicitous situation for readers: the proportion of OA they <i>experience</i> as they browse and search is better than the overall percentage of OA across the literature as a whole. Users of the Unpaywall browser extension, which gives individual readers access to the oaDOI service, encounter OA articles nearly half (47%) of the time. The effect almost certainly extends beyond Unpaywall users; one may assume readers in general also favor newer articles, and therefore benefit from the growth of Gold, Bronze, and Hybrid OA among recent papers, even without using Unpaywall. More studies of readership data from other sources would be useful to quantify this further.</p>
+ <p id="p-63">Interestingly, we found that the majority of OA articles are Bronze–hosted on publisher websites, either without a license at all or without an open license. This is surprisingly high given that Bronze is relatively little-discussed in the OA literature, and suggests that this OA category deserves further attention from the OA community. In particular, Bronze OA may be significant in a policy context, since, unlike other publisher-hosted OA, Bronze articles do not extend any reuse rights beyond reading, making them Gratis OA. Much more research is needed into the characteristics of Bronze OA. How many Bronze articles are licensed openly, but do not make their license available? Is Bronze disproportionately non-peer-reviewed content? How much of Bronze OA is also Delayed OA? How much Bronze is Promotional, and how transient is the free-to-read status of this content? How many Bronze articles are published in “hidden gold” journals that are not listed in the DOAJ? Why are these journals not defining an explicit license for their content, and are there effective ways to encourage this? These and other questions are outside the scope of this study but may provide fruitful insights for future OA research and policy.</p>
+ <p id="p-64">Only about 7% of the literature overall (and 17% of the OA literature) is Green. This may at first seem disappointing, given years of advocacy focused on Green OA as well as ongoing growth in the number of Green OA mandates (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014" title="Anatomy of green open access" data-jats-ref-type="bibr" data-jats-rid="ref-11">Björk et al., 2014</a>). However, the full context of Green OA provides reasons for optimism. First, many papers are archived in repositories but are not counted as Green in this analysis because they are also available on the publisher site as Hybrid, Gold, or Bronze versions. These “shadowed Green” copies provide a useful safety net that preserves access in cases where publishers rescind it (as could potentially happen with Delayed OA and other Bronze articles). Further research is needed to determine the prevalence of shadowed Green OA in various disciplines. Second, the phenomenon of “backfilling” (authors self-archiving content published across all years, not just the current one) means that although the percentage graph of Green OA does not show the same year-over-year slope as Gold or Hybrid, the line itself may be rising across <i>all</i> years as authors gradually self-archive papers from years or even decades ago. This assumption is supported by results reported by <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. Finally, the relatively low proportion of green OA encouragingly leaves room for continued growth. While most journals published by major publishers (Elsevier, Wiley, Springer, etc.) 
allow for self-archiving, research shows that only a small proportion of papers from these publishers actually are self-archived in OA repositories; for example, <a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=" title="Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature" data-jats-ref-type="bibr" data-jats-rid="ref-42">Smith et al. (in press)</a> report using a sample of Global Health Research papers that only 39% of them made use of available self-archiving rights.</p>
+ <p id="p-65">Our results confirm the Open Access Citation Advantage found by other studies: open articles receive 18% more citations than otherwise expected. While at least some of this boost is likely due to the fact that more access allows more people to read and hence cite articles they otherwise would not, causation is difficult to establish and there are many possible confounders. Most discussed is the so-called “selection bias postulate”, (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2007.04.001" title="Do open access articles have greater citation impact?" data-jats-ref-type="bibr" data-jats-rid="ref-18">Craig et al., 2007</a>) which suggests that authors choose only their most impactful work to make OA. The current study does not examine the cause or directionality of correlation, but does find that it exists in a very large sample that is relatively representative of the literature as a whole. Funder requirements may also play a role in the observed citation advantage: high-profile funders are more likely to have an OA publishing requirement; at the same time, well funded studies are independently more likely to receive more citations than poorly funded studies (<a class="xref xref-bibr" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/" title="Measuring the scientific output and impact of NIGMS grants" data-jats-ref-type="bibr" data-jats-rid="ref-7">Berg, 2010</a>). Interestingly, Gold articles are actually cited <i>less</i>, likely due to an increase in the number of newer and smaller OA journals. 
Some of these journals are from regions of the world not historically indexed by WoS, are published in languages other than English, or might be considered to be less prestigious because they have not had time to become established or accumulate citations (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>). On the flip side, the citation disadvantage of Gold OA is likely also affected by the continued growth of so-called ‘mega journals’ such as PLOS ONE (<a class="xref xref-bibr" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication" title="Reviewer guidelines: criteria for publication" data-jats-ref-type="bibr" data-jats-rid="ref-40"> PLOS, 2018</a>). Whatever the reason, the lower impact of Gold means the overall citation advantage is strongly driven by Green, Hybrid, and Bronze content. In sum, while several factors can affect the observed differences in citation rates, and causation remains difficult to establish, the fact remains that scholars are much more likely to read and cite papers to which they have access than those that they cannot obtain. Hopefully the existence of a free, open index of OA content will help support further research into the OACA question.</p>
+ <p id="p-66">The relatively high percentage of OA found in this study, particularly among readers of the free Unpaywall extension, has important potential implications for academic libraries. Increasingly, these libraries are under pressure to meet growing prices of “Big Deal” subscription packages, and the once-unthinkable outcome of canceling these Big Deals is becoming an increasingly realistic option. In this environment, knowing that around half of the literature of interest is available without any subscription may tip the scales toward cancellation for some institutions–particularly given that this percentage seems to be growing steadily. Indeed, the Université de Montréal’s cancellation of their Taylor &amp; Francis subscription package (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>) is particularly interesting, given that their cancellation announcement directly pointed faculty to Unpaywall and other tools to help them access OA content. This may seem a radical suggestion, but cancellation of subscription journals has long been part of the universal OA roadmap (<a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/" title="The forbidden forecast: thinking about open access and library subscriptions" data-jats-ref-type="bibr" data-jats-rid="ref-2">Anderson, 2017b</a>). 
Even when the percentage of OA is not enough to support outright cancellation, it may be enough to negotiate better subscription rates by supporting calculation of “OA-adjusted Cost Per Access” (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>). However, much more study is needed to see how OA availability varies across journals and Big Deal packages, along with praxis-oriented work building OA analysis tools that help librarians make cancellation choices.</p>
+ <p id="p-67">This study has several important limitations. Our dataset only includes journal articles with DOIs, which means that disciplines and geographical areas which rely more heavily on conference papers or articles without DOIs are underrepresented. Our Crossref sample includes about 7% journal “front matter” that the journal has assigned a DOI and Crossref labelled “journal article” but is actually a page describing the journal Editorial Board or similar. Our Bronze OA category includes articles published in OA journals which aren’t indexed in DOAJ; future work must identify these OA journals and classify such articles as Gold. As discussed in our definition of OA, when finding open copies we ignored free-to-read articles from academic social networks like ResearchGate and Academia.edu. The oaDOI system has some coverage of articles published on personal web pages, but this is quite limited compared to web-scale indexes like Google. The oaDOI system includes thousands of institutional and subject repositories, but there are some repositories that it misses. Our accuracy checks suggest that oaDOI, and therefore this study, are probably overlooking around 23% of OA otherwise discoverable using web searches, meaning that estimates reported in this paper undercount OA by approximately 30%. Finally, our approach did not detect <i>when</i> articles were deposited into repositories. Because repositories are often backfilled with content that has been published many years ago, this study does not measure any increase/decrease in prevalence of Green OA over time, but only the proportion of Green OA by article publication date at the moment of data collection.</p>
+ <p id="p-68">In addition to the empirical results obtained, this paper clearly shows the potential of the oaDOI service for future research. The freely available oaDOI service provides scholars with the basis for assessing and monitoring the development of access to scholarly literature on a large scale, as well as the factors that affect it. For instance, our results show that the percentage of the literature available as OA is growing, and that articles diffused through this form are generally more cited than closed access articles. Several factors are likely to contribute to these trends; however, those remain poorly understood. Combined with other datasets–such as the WoS, Scopus, or Crossref–oaDOI allows one to assess at a large-scale the effects of various mandates on deposit rates, or to track the development of documents’ accessibility to determine, for example, when authors self-archive, or the sustainability of the promotional OA category. Aggregated at the level of journals and publishing platforms, these data can also provide librarians with indicators to help inform subscription cancellations and mitigate their effects. The application of the oaDOI algorithm on a large scale also allows for more complete analysis of the OA citation advantage across fields and time. As in <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0013636" title="Self-selected or mandated, open access increases citation impact for higher quality research" data-jats-ref-type="bibr" data-jats-rid="ref-23">Gargouri et al. (2010)</a>, confounding factors could be mitigated by using article-level metadata to identify article pairs published in the same journal issue, on the same topic or published by the same authors at the same time. We hope that other scholars will dig deeper in those data to better understand OA dissemination and the factors that drive it. This is of utmost importance for the future of scholarly communication.</p>
+ </section>
+ <section class="sec" id="supplemental-information">
+ <h2 class="heading"> Supplemental Information</h2>
+ <div class="supplementary-material well well-small" id="supp-1" data-jats-mimetype="application" data-jats-mime-subtype="vnd.openxmlformats-officedocument.wordprocessingml.document">
+<h3 class="heading">Additional results</h3>
+
+ <div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/supp-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/supp-1</a>
+</div>
+<div><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/appendix.docx" class="btn article-supporting-download" data-rel="supplement" download="appendix.docx" data-filename="appendix.docx"><i class="icon-large icon-download-alt"> </i> Download</a></div>
+</div>
+ </section>
+ </div>
+<div id="article-footnotes">
+<div class="fn article-footnote" id="fn-1"><span class="p">In the interest of full disclosure, it should be noted that two of the authors of the paper are the co-founders of Impactstory, the non-profit organization that developed oaDOI.</span></div>
+<div class="fn article-footnote" id="fn-2"><span class="p">Repositories that were included are those covered by the Bielefeld Academic Search Engine (BASE) in May 2017. A full listing of repositories can be found on their website at: <a class="ext-link" href="https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1</a>
+ </span></div>
+<div class="fn article-footnote" id="fn-3"><span class="p">DOIs are short, unique identifiers for scholarly papers. Crossref is a nonprofit that helps run the DOI system, and is by far the largest supplier of academic DOIs in academia.</span></div>
+<div class="fn article-footnote" id="fn-4"><span class="p">Based on a Sci-Hub dataset released in 2016 (the most recent data available).</span></div>
+<div class="fn article-footnote" id="fn-5"><span class="p">These journals were identified by selecting journals with over one thousand articles per year from those classified in the general “biomedical research” category. The full list of journals meeting these criteria was: PLOS ONE, Nature, Science, Scientific Reports, PNAS, Nature Communication, PeerJ, and Science Advances.</span></div>
+<div class="fn article-footnote" id="fn-6"><span class="p">Ties between frequently cited specialties were resolved randomly; that is, if a paper cites exactly the same amount of papers from two NSF specialties, it was assigned to one of the two at random.</span></div>
+<div class="fn article-footnote" id="fn-7"><span class="p">Citations were normalized using the population of WoS articles and reviews with a DOI.</span></div>
+</div></main><footer class="back">
+ <section class="ack" id="acknowledgements"><h2 class="heading">Acknowledgements</h2>
+ <p>The authors would like to thank Dorothea Salo, Kristin Antelman, and John Sack for extensive and valuable comments on a draft of this article. The author order of JP and HP was determined by coin flip, as is their custom.</p>
+ </section>
+ <div class="sec" id="additional-information">
+ <h2 class="heading">Additional Information and Declarations</h2>
+ <div class="fn-group" data-jats-content-type="competing-interests">
+ <h3 class="heading">Competing Interests</h3>
+<div class="fn" id="conflict-1" data-jats-fn-type="conflict"><p>Heather Piwowar and Jason Priem are founders of Impactstory, a non-profit company which makes Unpaywall, oaDOI, and other tools to improve scholarly communication.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="author-contributions">
+ <h3 class="heading">Author Contributions</h3>
+<div class="fn" id="contribution-1" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-1" data-jats-ref-type="contrib" data-jats-rid="author-1">Heather Piwowar</a>, <a class="xref xref-contrib" href="#author-2" data-jats-ref-type="contrib" data-jats-rid="author-2">Jason Priem</a> and <a class="xref xref-contrib" href="#author-9" data-jats-ref-type="contrib" data-jats-rid="author-9">Stefanie Haustein</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-2" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-3" data-jats-ref-type="contrib" data-jats-rid="author-3">Vincent Larivière</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-3" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-4" data-jats-ref-type="contrib" data-jats-rid="author-4">Juan Pablo Alperin</a> conceived and designed the experiments, performed the experiments, analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-4" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-5" data-jats-ref-type="contrib" data-jats-rid="author-5">Lisa Matthias</a> performed the experiments, analyzed the data, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-5" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-6" data-jats-ref-type="contrib" data-jats-rid="author-6">Bree Norlander</a> analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-6" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-7" data-jats-ref-type="contrib" data-jats-rid="author-7">Ashley Farley</a> wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-7" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-8" data-jats-ref-type="contrib" data-jats-rid="author-8">Jevin West</a> reviewed drafts of the paper.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="other">
+ <h3 class="heading">Data Availability</h3>
+<div class="fn" id="addinfo-1">
+<p>The following information was supplied regarding data availability:</p>
+ <p>Zenodo: <a class="ext-link" href="http://doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://doi.org/10.5281/zenodo.837902</a>.</p>
+ <p>The datasets behind the analysis in this paper are openly available at <a class="ext-link" href="http://dx.doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://dx.doi.org/10.5281/zenodo.837902</a> and the R statistics code can be found at <a class="ext-link" href="https://github.com/Impactstory/oadoi-paper1" data-jats-ext-link-type="uri">https://github.com/Impactstory/oadoi-paper1</a>. The oaDOI code is open source at <a class="ext-link" href="https://github.com/impactstory/oadoi" data-jats-ext-link-type="uri">https://github.com/impactstory/oadoi</a> and information about accessing the oaDOI API and full dataset is at <a class="ext-link" href="https://oadoi.org/api" data-jats-ext-link-type="uri">https://oadoi.org/api</a>.</p>
+</div>
+</div>
+ <h3 class="heading">Funding</h3>
+<p>The authors received no funding for this work.</p>
+</div>
+ <section class="ref-list-container" id="references"><h2 class="heading">References</h2>
+<ul class="ref-list" data-jats-content-type="authoryear">
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-1">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017a</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">When the wolf finally arrives: big deal cancelations in North American Libraries</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2018-01-09">09 January 2018</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-2">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017b</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">The forbidden forecast: thinking about open access and library subscriptions</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-07-15">15 July 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-3">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Antelman</span> <span class="given-names" itemprop="givenName">K</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf">Leveraging the growth of open access in library collection decision making</a>.</cite> In: <span itemprop="name"><a class="conf-name" target="_blank" href="https://scholar.google.com/scholar_lookup?title=Proceeding%20from%20ACRL%202017:%20at%20the%20helm:%20leading%20transformation&amp;author=&amp;publication_year=2017">Proceeding from ACRL 2017: at the helm: leading transformation</a>.</span><span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-4">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <span class="article-title"> <span class="source">Proportion of open access peer-reviewed papers at the European and world levels–2004–2011</span>. </span><span class="institution">European Commission, Brussels</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-5">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">AF</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <span class="article-title"> <span class="source">Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013</span>. </span><span class="institution">European Commission</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-6">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Côté</span> <span class="given-names" itemprop="givenName">G</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Struck</span> <span class="given-names" itemprop="givenName">B</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Voorons</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom">Research impact of paywalled versus open access papers</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-7">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Berg</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">Measuring the scientific output and impact of NIGMS grants</a>.</cite> <span> <span class="comment">NIGMS Feedback Loop Blog [Blog post]. <a class="uri" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/</a>
+ </span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-8">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2016a</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2016.08.002">Hybrid open access—a longitudinal study</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">919</span>-<span class="lpage" itemprop="pageEnd">932</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-9">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B-C</span></span>.</b> <b class="year" itemprop="datePublished">2016b</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1021">The open access movement at a crossroad: are the big publishers and academic social media taking over?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">29</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">131</span>-<span class="lpage" itemprop="pageEnd">134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-10">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1096">Gold, green, and black open access</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">30</b></span>:<span class="fpage" itemprop="pageStart">173</span>-<span class="lpage" itemprop="pageEnd">175</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-11">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paetau</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014">Anatomy of green open access</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Association for Information Science and Technology</span></span> <b itemprop="volumeNumber">65</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">237</span>-<span class="lpage" itemprop="pageEnd">250</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-12">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Majlender</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Guðnason</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0011273">Open access to the scientific journal literature: situation 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b 
itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e11273</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-13">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bohannon</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1126%2Fscience.352.6285.508">Who’s downloading pirated papers? Everyone</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science</span></span> <b itemprop="volumeNumber">352</b></span>(<span itemprop="issueNumber">6285</span>)</span>:<span class="fpage" itemprop="pageStart">508</span>-<span class="lpage" itemprop="pageEnd">512</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-14">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Boudry</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chartron</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-016-2225-6">Availability of digital object identifiers in publications archived by PubMed</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics March</span></span> <b itemprop="volumeNumber">110</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">1453</span>-<span class="lpage" itemprop="pageEnd">1469</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-15">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chawla</span> <span class="given-names" itemprop="givenName">D</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement">Publishers take ResearchGate to court, alleging massive copyright infringement</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science News</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-16">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F19322909.2013.795426">Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Web Librarianship</span></span> <b itemprop="volumeNumber">7</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">243</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-17">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Olijhoek</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2016.1182672">Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">42</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">108</span>-<span class="lpage" itemprop="pageEnd">115</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-18">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Craig</span> <span class="given-names" itemprop="givenName">ID</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Plume</span> <span class="given-names" itemprop="givenName">AM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McVeigh</span> <span class="given-names" itemprop="givenName">ME</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Pringle</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amin</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2007</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2007.04.001">Do open access articles have greater citation impact?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">1</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">239</span>-<span class="lpage" itemprop="pageEnd">248</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-19">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Creative Commons</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://creativecommons.org/licenses/by/4.0/">Attribution 4.0 International (CC BY 4.0)</a></cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-20">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1096%2Ffj.11-183988">Open access, readership, citations: a randomized controlled trial of scientific journal publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">FASEB Journal</span></span> <b itemprop="volumeNumber">25</b></span>:<span class="fpage" itemprop="pageStart">2129</span>-<span class="lpage" itemprop="pageEnd">2134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-21">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walters</span> <span class="given-names" itemprop="givenName">WH</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.3163%2F1536-5050.99.3.008">The impact of free access to the scientific literature: a review of recent research</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">99</b></span>:<span class="fpage" itemprop="pageStart">208</span>-<span class="lpage" itemprop="pageEnd">217</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-22">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fortney</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gonder</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <span class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html">A social networking site is not an open access repository</a>. <span class="source">Office of Scholarly Communication</span>. </span><span class="institution">University of California</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-23">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0013636">Self-selected or mandated, open access increases citation impact for higher quality research</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" 
itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">10</span>)</span>:<span class="fpage" itemprop="pageStart">e13636</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-24">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/1206.3664">Green and gold open access percentages and growth, by discipline</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-25">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gorraiz</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Melero-Fuentes</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gumpenbergera</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Valderrama-Zuriánc</span> <span class="given-names" itemprop="givenName">J-C</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2015.11.008">Availability of digital object identifiers (DOIs) in web of science and scopus</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">98</span>-<span class="lpage" itemprop="pageEnd">109</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-26">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greshake</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.11366.1">Looking into Pandora’s Box: the content of <i>Sci-Hub</i> and its usage [version 1; referees: 2 approved, 2 approved with reservations]</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000Research</span></span> <b itemprop="volumeNumber">6</b></span> <span class="comment">Article 541</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-27">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>.</b> <b class="year" itemprop="datePublished">2006</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/cs/0606079">Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-28">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Vallières</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hitchcock</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Oppenheim</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hilf</span> <span class="given-names" 
itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2008.10765150">The access/impact problem and the green and gold roads to open access: an update</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">34</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">36</span>-<span class="lpage" itemprop="pageEnd">40</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-29">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Himmelstein</span> <span class="given-names" itemprop="givenName">DS</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Romero</span> <span class="given-names" itemprop="givenName">AR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McLaughlin</span> <span class="given-names" itemprop="givenName">SR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tzovaras</span> <span class="given-names" itemprop="givenName">BG</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greene</span> <span class="given-names" itemprop="givenName">CS</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1">Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)</a></cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ Preprints</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-30">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jamali</span> <span class="given-names" itemprop="givenName">HR</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-017-2291-4">Copyright compliance and infringement in ResearchGate full-text journal articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">112</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">241</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-31">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1186%2F1741-7015-10-124">Anatomy of open access publishing: a study of longitudinal development and internal structure</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Medicine</span></span> <b itemprop="volumeNumber">10</b></span> <span class="comment">Article 124</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-32">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fasi.22856">Delayed open access: an overlooked high-impact category of openly available scientific literature</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the American Society for Information Science and Technology</span></span> <b itemprop="volumeNumber">64</b></span>(<span itemprop="issueNumber">7</span>)</span>:<span class="fpage" itemprop="pageStart">1323</span>-<span class="lpage" itemprop="pageEnd">1329</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-33">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bukvova</span> <span class="given-names" itemprop="givenName">H</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nyman</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0020961">The development of open access journal publishing from 1993 to 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b 
itemprop="volumeNumber">6</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e20961</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-34">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Matsubayashi</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kurata</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Sakai</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mine</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ueda</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> 
<cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009">Status of open access in the biomedical field in 2005</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">97</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">4</span>-<span class="lpage" itemprop="pageEnd">11</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-35">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McCabe</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Snyder</span> <span class="given-names" itemprop="givenName">C</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1111%2Fecin.12064">Identifying the effect of open access on citations using a panel of science journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Economic Inquiry</span></span> <b itemprop="volumeNumber">52</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">1284</span>-<span class="lpage" itemprop="pageEnd">1300</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-36">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McKiernan</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bourne</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brown</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Buck</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kenall</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Lin</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McDougall</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nosek</span> <span class="given-names" itemprop="givenName">BA</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ram</span> <span class="given-names" itemprop="givenName">K</span></span>, <span 
class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soderberg</span> <span class="given-names" itemprop="givenName">CK</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName"> Spies</span> <span class="given-names" itemprop="givenName"> JR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Updegrove</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Woo</span> <span class="given-names" itemprop="givenName">KH</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Yarkoni</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rodgers</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7554%2FeLife.16800">How open science helps researchers succeed</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">eLife</span></span> <b itemprop="volumeNumber">5</b></span>:<span class="elocation-id" itemprop="pageStart">e16800</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-37">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paul-Hus</span> <span class="given-names" itemprop="givenName">A</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-015-1765-5">The journal coverage of Web of Science and Scopus: a comparative analysis</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">106</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">213</span>-<span class="lpage" itemprop="pageEnd">228</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-38">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ottaviani</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0159614">The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">11</b></span>(<span itemprop="issueNumber">8</span>)</span>:<span class="fpage" itemprop="pageStart">e0159614</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-39">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Packer</span> <span class="given-names" itemprop="givenName">AL</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010">The SciELO open access: a gold way from the south</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Canadian Journal of Higher Education</span></span> <b itemprop="volumeNumber">39</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">111</span>-<span class="lpage" itemprop="pageEnd">126</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-40">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">PLOS</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication">Reviewer guidelines: criteria for publication</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-41">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Schiermeier</span> <span class="given-names" itemprop="givenName">Q</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mega</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1038%2Fnature.2016.21223">Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Nature News</span></span> <b itemprop="volumeNumber">541</b></span>(<span itemprop="issueNumber">7635</span>)</span>:<span class="fpage" itemprop="pageStart">13</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-42">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Smith</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Haustein</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fei</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ridde</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>.</b></span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=">Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Health Research Policy and System</span></span> <span 
class="comment">In Press</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-43">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">SPARC Europe</span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/">The open access citation advantage: list of studies until 2015</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-44">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Suber</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://dash.harvard.edu/handle/1/4322580">Gratis and libre open access</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">SPARC Open Access Newsletter, 124</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-45">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D">The open access citation advantage</a>.</cite> <span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-08-02">2 August 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-46">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">JP</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Waldner</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jacques</span> <span class="given-names" itemprop="givenName">DC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Masuzzo</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Collister</span> <span class="given-names" itemprop="givenName">LB</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hartgerink</span> <span class="given-names" itemprop="givenName">CH</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.8460.3">The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000 Research</span></span> <b 
itemprop="volumeNumber">5</b></span> <span class="comment">Article 632</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-47">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Universität Konstanz</span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/">Teurer als die Wissenschaft erlaubt</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-48">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Université de Montréal</span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm">UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-49">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Wagner</span> <span class="given-names" itemprop="givenName">AB</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.5062%2FF4Q81B0W">Open access citation advantage: an annotated bibliography</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Issues in Science and Technology Librarianship</span></span> <b itemprop="volumeNumber">60</b></span>:<span class="fpage" itemprop="pageStart">2</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-50">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walker</span> <span class="given-names" itemprop="givenName">TJ</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soichi</span> <span class="given-names" itemprop="givenName">transl. T</span></span>.</b> <b class="year" itemprop="datePublished">1998</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1241%2Fjohokanri.41.678">Free internet access to traditional journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Information Processing and Management</span></span> <b itemprop="volumeNumber">41</b></span>(<span itemprop="issueNumber">9</span>)</span>:<span class="fpage" itemprop="pageStart">678</span>-<span class="lpage" itemprop="pageEnd">694</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-51">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2003</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003">The nine flavours of open access scholarly publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Postgraduate Medicine</span></span> <b itemprop="volumeNumber">49</b></span>:<span class="fpage" itemprop="pageStart">263</span>-<span class="lpage" itemprop="pageEnd">267</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Book" id="ref-52">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite class="article-title"></cite> <span itemprop="name"><a class="source" target="_blank" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=&amp;publication_year=2009">The access principle: the case for open access to research and scholarship</a></span><span> (<span class="edition">1 edition</span>). Cambridge: <span class="publisher">MIT Press</span>. </span>
+</div></li>
+</ul></section>
+ </footer></article>
+ </div>
+
+
+ <div id="related-research"></div>
+
+ <!-- annotations -->
+ <ul class="nav nav-tabs annotation-tabs-nav">
+ <li class="active"><a href="#questions" data-toggle="tab"><i class="icon-comments"></i> Questions
+ <span class="annotation-counter annotation-counter-questioning"></span></a></li>
+ <li><a href="#links" data-toggle="tab"><i class="icon-link"></i> Links
+ <span class="annotation-counter annotation-counter-linking"></span></a></li>
+ </ul>
+
+ <div class="tab-content annotation-tab-content">
+ <div class="tab-pane active" id="questions">
+ <div class="annotations" id="questions" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/questions/index.html?target=articles/4375&amp;_sort=score">Questions</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-question"
+ data-toggle="annotation-form"
+ data-target="#annotation-question-create-container"
+ rel="nofollow"
+ href="/questions.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Ask a question</a>
+ <div class="help-block annotation-learn-more"><a href="/about/FAQ/academic-contribution/" target="_blank">Learn more about Q&amp;A</a></div>
+ <div class="annotation-form-container"
+ id="annotation-question-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+
+ <div class="tab-pane" id="links">
+ <div class="annotations" id="links" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/links/index.html?target=articles/4375&amp;_sort=score">Links</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-link"
+ data-toggle="annotation-form"
+ data-target="#annotation-link-create-container"
+ rel="nofollow"
+ href="/links.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Add a link</a>
+ <div class="annotation-form-container"
+ id="annotation-link-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+
+ <div class="hidden-desktop" id="mobile-featured-jobs"></div>
+ </div>
+
+ <!-- Right sidebar -->
+ <div class="span3 offset1 article-sidebar visible-desktop">
+ <div id="article-sidebar-main-content" data-todo-href="/todos/19698/">
+ <div class="dimensions-stats-container">
+ <span class="__dimensions_badge_embed__" data-doi="10.7717/peerj.4375" data-hide-zero-citations="true" data-legend="always" data-style="small_circle"></span>
+ </div>
+
+
+ <div class="row-fluid item-action-buttons article-sidebar-item">
+ <div class="span12">
+ <a href="/benefits/" class="author-quote article-author-quote-link">
+ <div class="author-quote-text">
+ <span class="lead-in">I published in PeerJ</span> and it is very fast, has good editors, has consistently given good quality and rigorous reviews of my work, and produces visually appealing manuscripts.</div>
+ <div class="author-quote-details">
+ <span class="author-quote-name">Matthew Jackson</span><br>
+ PeerJ author
+ </div>
+</a> <div class="article-free-publishing-cta">
+ <div class="article-free-publishing-cta-title">Publish Free in 2020</div>
+ <div class="article-free-publishing-cta-subline">In PeerJ Chemistry Journals</div>
+ <a href="https://peerj.com/blog/post/115284881305/free-open-access-publishing-for-chemistry-and-computer-science-subject-areas" class="btn btn-article article-free-publishing-cta-btn">
+ Learn more
+ </a>
+ </div>
+ <div id="download-modal-trigger" class="js-download-modal-trigger btn btn-article btn-download btn-success mb-3 ">
+ Download
+</div> <!--<div class="content-cta-intro-text">Want alerts from articles like this?</div>-->
+<div id="content-alert-link" class="content-alert-link-btn" data-href="/content-alert/?aid=19698">
+ <div id="content-alert-button-label">
+ <i class="icon-envelope btn-content-alert-icon"></i>
+ Content <div class="content-alert-btn-lastword">Alert</div>
+ </div>
+ <div id="content-alert-button-loading" style="display:none;"><i class="icon-spin icon-spinner"></i> Loading...</div>
+</div>
+ <div class="content-cta-help-text">
+ Just enter your email
+ </div>
+ </div>
+ </div>
+
+
+
+
+ <nav class="article-sidebar-block">
+ <div class="sidebar-heading">
+ <i class="icon-wrench"></i> Tools & info
+ </div>
+ <ul class="nav nav-list article-item-metrics-counts" data-src="/articles/4375/counter/">
+ <li>
+ <a href="/articles/4375/reviews/"
+ rel="version-history">Peer Review history</a>
+ </li>
+
+
+ <li><a href="/articles/4375/citations/" data-toggle="modal" data-target="#citing-modal">See citing articles <span class="metric-counter citation-item-count">203</span></a></li>
+
+
+ <li><a href="#questions">Ask questions
+ <span class="metric-counter annotation-counter-questioning"></span></a></li>
+
+ <li><a href="#links">Add links
+ <span class="metric-counter annotation-counter-linking"></span></a></li>
+
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Visitors <span class="metric-counter" data-count="visitors">&nbsp;</span> <span class="pull-right metric-counter-details-cta">click for details</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Views <span class="metric-counter" data-count="views-html">&nbsp;</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Downloads <span class="metric-counter" data-count="views-pdf">&nbsp;</span></a></li>
+
+ <li><a id="item-flag-button" data-toggle="modal" href="#flagModal">Report problem with article</a></li>
+ </ul>
+ </nav>
+
+
+ <div id="related-research-sidebar"></div>
+
+</div>
+<nav class="article-sidebar-block follow" >
+ <div class="sidebar-heading">
+ <i class="icon-list-ul"></i> Outline
+ </div>
+ <div class="article-navigation"></div>
+ <div id="top-return" class="top-return">
+ <i class="icon-arrow-up"></i> Return to top
+ </div>
+
+ <div data-clone="#expertrxiv-related" data-source="/expertrxiv/related/?subjectIds=85%2C87%2C111&amp;subjects=Legal%20Issues%2C%20Science%20Policy%2C%20Data%20Science"></div>
+
+ </nav>
+
+<div class="subjects-navigation"></div>
+
+ <div id="article-identifiers">
+ <span class="article-meta-name">PubMed</span>
+ <a href="https://www.ncbi.nlm.nih.gov/pubmed/29456894"
+ id="article-identifier-pmid" target="_blank">29456894</a>
+ </div>
+ </div>
+ </div>
+
+
+<style>
+ .modal-loading-container{
+ display:flex;
+ justify-content:center;
+ color:#999;
+ padding:3rem;
+ }
+</style>
+
+<div id="download-article-modal" class="modal hide fade peer-review-article" style="">
+
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Download article</h3>
+ </div>
+
+ <div class="modal-body">
+ <div id="download-article-modal-loading" class="modal-loading-container" style="display:none;">
+ <i class="icon-spin icon-3x icon-spinner"></i>
+ </div>
+ <div id="download-article-modal-body">
+ <div id="download-modal-buttons-container">
+ <div class="download-modal-article-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</div>
+ <div class="mt-2 download-buttons">
+ <a target="_blank" download data-format="PDF" data-download-confirm-text="PDF downloaded" href="https://peerj.com/articles/4375.pdf" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> PDF (2.3MB)</a>
+ <a target="_blank" data-download-confirm-text="Mendeley opened" href="http://www.mendeley.com/import/?doi=10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Save to Mendeley</a>
+ <a target="_blank" data-download-confirm-text="Readcube article opened" href="http://www.readcube.com/articles/10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Read in ReadCube</a>
+ <a target="_blank" data-format="RIS" data-download-confirm-text="RIS file downloaded" href="https://peerj.com/articles/4375.ris" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> RIS</a>
+ <a target="_blank" data-format="XML" data-download-confirm-text="XML file downloaded" href="https://peerj.com/articles/4375.xml" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> XML</a>
+ <a target="_blank" data-format="BibText" data-download-confirm-text="BibTeX file downloaded" href="https://peerj.com/articles/4375.bib" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> BibTeX</a>
+
+ </div>
+ </div>
+
+ <div id="download-modal-downloading-message" style="display:none;">
+ <div class="text-center pt-4 pb-4">
+ <div>
+ <strong>Your download will start in a moment...</strong>
+ </div>
+ <div class="btn btn-secondary mt-4 js-close-download-modal">Close</div>
+ </div>
+ </div>
+
+ <div id="download-modal-signup-container" style="display:none;">
+
+<div class="download-modal-cta-container">
+
+ <div class="download-modal-confirm">
+ <div class="download-modal-confirm-title">
+ <i class="icon-tickcircle downloaded-tick"></i> <span class="download-modal-confirm-title-text"></span>
+ <i class="icon-chevron-down show-download-link"></i>
+ </div>
+ <a class="article-modal-download-url" href=""></a>
+ </div>
+
+
+ <div class="download-modal-cta-subtitle-small mt-2 mb-4 text-center">
+ Subscribe for subject updates
+ </div>
+
+ <div class="section-subscribe-container mb-2" style="display: flex;justify-content:center;">
+ <div>
+ <input type="text" placeholder="Email address" name="email" value="" class="form-control" id="download-subscribe-email">
+ </div>
+ <div class="ml-1">
+ <select name="freq" class="form-control" style="width: 100%;" id="download-subscribe-freq">
+ <option value="daily">Daily</option>
+ <option value="weekly">Weekly</option>
+ </select>
+ </div>
+ </div>
+
+ <div id="download-subscribe-error-container" class="mb-2 text-center text-error" style="display:none;"></div>
+
+
+ <button class="btn btn-primary btn-block btn-large mb-2 btn-modal-cta"
+ style="display: block;"
+ id="download-subscribe-submit"
+ data-url="/content-alert/download-subscribe?aid=19698"
+ data-signed-in=""
+ data-section-name="">
+ Subscribe
+ </button>
+
+ <a href="#" class="btn btn-block btn-link btn-large btn-modal-close js-close-download-modal mb-2">
+ Close
+ </a>
+
+</div>
+
+<script>
+ (function(){
+ $('#download-subscribe-submit').click(function(){
+
+ var button = $(this);
+ var url = button.data('url');
+ if(button.attr('disabled')) return;
+
+ $.get(url, function(response){
+
+ if(!response.token){
+ errorContainer.html('Server error, you have not been subscribed').show();
+ button.html('Subscribe').removeAttr('disabled');
+ return;
+ }
+
+ var errorContainer = $('#download-subscribe-error-container');
+ errorContainer.html('').hide();
+ button.html('<i class="icon-spin icon-spinner"></i>').attr('disabled', true);
+
+ var signedIn = button.data('signed-in');
+ var sectionName = button.data('section-name');
+ var data = {
+ _token: response.token
+ };
+
+ if(!signedIn) {
+ var email = $('#download-subscribe-email').val();
+ data.email = email;
+ data.freq = $('download-subscribe-freq').val();
+ }
+
+ $.ajax({
+ url: url,
+ method: 'POST',
+ data: data
+ }).success(function(response){
+ button.hide();
+ $('.js-close-download-modal').trigger('click');
+
+ PeerJ.Tools.ToastNotifications.add({
+ type: 'success',
+ title: 'Subscribed',
+ text: sectionName ? 'You subscribed to ' + sectionName : 'You subscribed to this article\'s subjects'
+ });
+
+ }).error(function(response){
+ if(response.responseJSON && response.responseJSON.errors){
+ errorContainer.html(response.responseJSON.errors[0]).show();
+ }
+ }).complete(function(){
+ button.html('Subscribe').removeAttr('disabled');
+ });
+
+ });
+ });
+
+ }());
+</script>
+ </div>
+ </div>
+ </div>
+
+ <div class="modal-footer" style="display:none;">
+ <div class="pull-right">
+ </div>
+
+ <span class="submit-copy submit-copy-btn btn cancel pull-left" id="modal-cancel" data-dismiss="modal">
+ Cancel
+ </span>
+ </div>
+</div>
+
+ <div id="ajax-form"></div>
+
+ <!-- Flag Modal -->
+ <div id="flagModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align: center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim">Report a problem</h3>
+ </div>
+
+ <form id="article-flag-form"
+ data-href="/issues/4375/flag/"
+ method="post">
+
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="alert alert-info">
+ <p><strong>Common use cases</strong><br>
+ Typos, corrections needed, missing information, abuse, etc
+ </p>
+
+ <p><strong>Our promise</strong><br>
+ PeerJ promises to address all issues as quickly and professionally as possible. We
+ thank you in advance for your patience and understanding.
+ </p>
+ </div>
+
+ <div id="flag-modal-result" style="margin-left:45px;">
+
+ <div>
+ <label><strong>Type of problem</strong></label>
+ <p>
+ <select id="moderation_flag_category" name="moderation_flag[category]" class="span4"><option value="typo">Typo</option><option value="metadata">Missing or incorrect metadata</option><option value="quality">Quality: PDF, figure, table, or data quality</option><option value="download">Download issues</option><option value="abuse">Abusive behavior</option><option value="misconduct">Research misconduct</option><option value="other">Other issue not listed above</option></select>
+
+ </p>
+ </div>
+ <div>
+ <label><strong>Details</strong> <i class="icon-large icon-question-sign" title="Please be as detailed as possible within the 500 character limit. Any details you provide will not be shown publicly." data-toggle="tooltip"></i></label>
+ <div>
+ <textarea id="moderation_flag_detail" name="moderation_flag[detail]" required="required" maxlength="500" class="span4" placeholder="Enter any details about this issue. Kept confidential with PeerJ staff." rows="5" data-counter-target="#flag-counter"></textarea>
+
+ <div style="margin:10px 0 0 0; color:#777777; float: left; display: block"><span id="flag-counter" class="label">500</span> characters remaining</div>
+ </div>
+ </div>
+
+ </div>
+
+ </div>
+ </form>
+ <div id="flag-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Cancel</button>
+ <input type="submit" class="btn btn-success save-flag-btn" value="Send report">
+ </div>
+</div>
+
+ <!-- Follow Publication Modal -->
+ <div id="followModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align:center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim" id="followModalLabel">Follow this publication for updates</h3>
+ </div>
+
+ <div>
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="row-fluid" style="margin-bottom: 15px">
+ <div class="span1">
+ <i class="icon-large icon-bullhorn"></i>
+ </div>
+ <div class="span11">
+ "Following" is like subscribing to any updates related to a publication.
+ These updates will appear in your home dashboard each time you visit PeerJ.
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span1">
+ <i class="icon-large icon-envelope"></i>
+ </div>
+ <div class="span11">
+ <p>
+ You can also choose to receive updates via daily or weekly email digests.
+ If you are following multiple publications then we will send you
+ no more than one email per day or week based on your preferences.
+ </p>
+ <p>
+ <em>Note: You are now also subscribed to the subject areas of this publication</em>
+ and will receive updates in the daily or weekly email digests if turned on.
+ You can <a href="/settings/details/">add specific subject areas</a> through your profile settings.
+ </p>
+ </div>
+ </div>
+
+ <hr>
+ <div id="follow-modal-result" style="margin-left:-40px;padding-top:7px;">
+ </div>
+
+ </div>
+
+ </div>
+
+ <div id="follow-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+ </div>
+
+ <!-- Unfollow Publication Modal -->
+ <div id="unfollowModal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Change notification settings or unfollow</h3>
+ </div>
+
+ <form id="article-unfollow-form"
+ data-href="/follow/publication/4375/1/"
+ method="put" class="form-horizontal">
+
+
+ <div id="unfollow-form-load-result" class="modal-body" data-href="/follow/publication/4375/edit/" style="max-height:350px;overflow-y:auto">
+ <p>Loading ...</p>
+ </div>
+
+ </form>
+ <div class="modal-footer">
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ <input type="submit" class="btn btn-success update-follow-btn" value="Update">
+ </div>
+</div>
+
+ <!-- Metrics Modal -->
+ <div id="metricsModal" class="modal hide">
+ <div class="modal-body" style="max-height:330px;overflow-y:auto">
+
+ <div class="row-fluid">
+ <div class="span12">
+ <p class="leadh2">Usage since published - updated daily</p>
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span8">
+ <h3 style="margin-bottom:10px">Social referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Twitter</div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Facebook</div>
+ <div class="span3" style="text-align:right;min-height:0">676</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Reddit</div>
+ <div class="span3" style="text-align:right;min-height:0">15</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">LinkedIn</div>
+ <div class="span3" style="text-align:right;min-height:0">11</div>
+ </div>
+
+ <h3 style="margin:30px 0 10px 0">Top referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From bookmark or typed URL
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">30,876</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Google search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">5,439</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Twitter
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From PeerJ Content Alert Emails
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">32</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Yahoo search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">20</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Webmail
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">3</div>
+ </div>
+ </div>
+
+ <div class="span4" style="overflow-x:hidden;">
+ <h3 style="margin-bottom:10px">Share this publication</h3>
+
+
+
+ <ul class="unstyled">
+ <li>
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+</ul>
+ <h3 style="margin-bottom:10px;margin-top:10px">Metrics</h3>
+
+ <!-- Altmetric -->
+ <div class="altmetric-embed" data-badge-popover="right"
+ data-link-target="_blank" data-doi="10.7717/peerj.4375"></div>
+ </div>
+ </div>
+
+ </div>
+
+ <div class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Wiki Modal -->
+
+ <!-- Links Modal -->
+ <div class="modal hide fade" id="article-links-modal">
+ <div class="modal-header">
+ <a rel="nofollow" data-dismiss="modal" aria-hidden="true" class="close">&times;</a>
+
+ <h3 class="modal-title">Links</h3>
+ </div>
+
+ <div class="modal-body"></div>
+
+ <div class="modal-footer">
+ <a rel="nofollow" href="/links.form?target=articles/4375" class="btn btn-primary">Add a link</a>
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Citing Modal -->
+ <div id="citing-modal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h2 class="slim"><i class="icon-copy"></i> Articles citing this paper</h2>
+ </div>
+ <div class="modal-body">Loading citing articles… <i class="icon icon-spinner icon-spin"></i></div>
+</div>
+
+ <!-- Graphical abstract modal -->
+
+ </div>
+
+
+ <div id="push"></div>
+ </div>
+
+ <footer id="footer">
+ <div class="foot">
+ <div class="container">
+
+ <div class="row">
+ <div class="span7">
+ <b>About us -</b> <a href="/about/" class="aboutLink" data-target="team">PeerJ team</a>
+ | <a href="/about/publications/" class="aboutLink" data-target="journals">Our publications</a> |
+ <a href="/benefits/">Benefits</a> | <a
+ href="/about/partnerships/" class="aboutLink" data-target="partnership">Partnerships</a> | <a
+ href="/about/endorsements/" class="aboutLink" data-target="endorsements">Endorsements</a>
+ <i class="icon-trophy"></i> <a href="/about/reviews/" class="aboutLink" data-target="reviews">Awards</a>
+ </div>
+ <div class="span5">
+ <b>Resources -</b> <a href="/about/FAQ/">FAQ</a> | <a
+ href="/about/careers/">Careers</a> | <a href="/about/press/">Press
+ room</a> | <a href="/about/terms/">Terms of use</a> | <a
+ href="/about/privacy/">Privacy</a> | <a
+ href="/about/contact/" class="aboutLink" data-target="contact">Contact</a>
+ </div>
+ <div class="span7">
+ <b>Academic boards -</b> <a href="/academic-boards/advisors/">Advisors</a> | <a
+ href="/academic-boards/editors/">Editors</a> |
+ <a href="/academic-boards/subjects/">Subject areas</a>
+ </div>
+ <div class="span5">
+ <b>Follow us -</b>
+ <a href="https://peerj.com/blog/">PeerJ blog</a> |
+ <a href="http://twitter.com/thePeerJ/" title="Follow on Twitter" data-toggle="tooltip">Twitter</a>
+ |
+ <a href="http://facebook.com/thePeerJ/" title="Follow on Facebook" data-toggle="tooltip">Facebook</a>
+ |
+ <a href="http://www.linkedin.com/company/peerj" title="Follow on LinkedIn" data-toggle="tooltip">LinkedIn</a>
+ |
+ <a href="https://www.instagram.com/thepeerj" title="Follow on Instagram" data-toggle="tooltip">Instagram</a>
+ |
+ <a href="http://www.pinterest.com/thepeerj/boards/" title="Follow on Pinterest" data-toggle="tooltip">Pinterest</a>
+ </div>
+ <div class="span7">
+ <b>Submission guides -</b>
+ <a href="/about/aims-and-scope"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/about/aims-and-scope/cs"><em>PeerJ Computer Science</em></a> |
+ <a href="/about/aims-and-scope/chemistry"><em>PeerJ Chemistry</em></a>
+ </div>
+ <div class="span5">
+ <b>Spread the word</b> -
+ <a href="/spread-the-word/activities/">Activities</a> |
+ <a href="/spread-the-word/resources/">Resources</a>
+ </div>
+ <div class="span7">&nbsp;</div>
+ <div class="span5">
+ <b>PeerJ feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom" rel="alternate" title="Articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1">RSS 1.0</a> |
+ <a href="/articles/index.rss2">RSS 2.0</a> |
+ <a href="/articles/index.json">JSON</a>
+ <br>
+
+ <b>PeerJ Computer Science feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom?journal=cs" rel="alternate" title="PeerJ Computer Science articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1?journal=cs">RSS 1.0</a> |
+ <a href="/articles/index.rss2?journal=cs">RSS 2.0</a> |
+ <a href="/articles/index.json?journal=cs">JSON</a>
+ <br>
+ <b>Archives - </b>
+ <a href="/archives/" rel="archives"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/archives/?journal=cs" rel="archives"><em>PeerJ Computer Science</em></a>
+ </div>
+
+</div>
+
+<div id="fb-root"></div>
+
+ <div class="row" style="margin-top:10px;font-size:12px">
+ <div class="span12" style="color:#888">
+
+ <div>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ</span> ISSN: 2167-8359</span>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ Comput. Sci.</span> ISSN: 2376-5992</span>
+ <span><span style="font-style:italic">PeerJ Preprints</span> ISSN: 2167-9843</span>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+ </footer>
+
+ <div id="alerts" data-async-alerts="/alerts/"></div>
+
+ <script src="/js/8d39319-35fca22.js"></script>
+ <script src="https://cdn.peerj.com/webpack/runtime.bfc7ab93.js"></script><script src="https://cdn.peerj.com/webpack/0.7880a6b6.js"></script><script src="https://cdn.peerj.com/webpack/1.24ea793f.js"></script><script src="https://cdn.peerj.com/webpack/vue-bundle.9bf24d69.js"></script>
+
+
+ <script src="/js/5d3c493-193ec0b.js"></script>
+
+ <script src="/js/c1dacd9-f146d62.js"></script>
+ <!--[if gt IE 8]><!-->
+ <script src="/assets/js/highlight/highlight.pack.js"></script>
+
+ <script>
+ $(function () {
+ // syntax highlighting for code blocks
+ $("pre > code").each(function() {
+ var node = $(this);
+
+ var language;
+
+ // JATS >=1.1
+ language = node.data('jats-language');
+
+ if (!language) {
+ // JATS <1.1
+ language = node.data('jats-preformat-type');
+
+ // ignore default 'code' type
+ if (language === 'code') {
+ language = null;
+ }
+ }
+
+ if (language) {
+ node.addClass('language-' + language);
+ }
+
+ hljs.highlightBlock(this);
+ });
+ });
+ </script>
+ <!--<![endif]-->
+
+ <script>
+ //initialise the follow button
+ $(function() {
+ PeerJ.Event.Follow.init();
+ });
+
+ //Show citations modal if query param exists
+ var urlParams = new URLSearchParams(window.location.search);
+ if(urlParams.has('citations')){
+ $('#citing-modal').modal('show');
+ }
+
+ </script>
+
+
+<script type="text/x-mathjax-config">
+ // MathJax configuration block (script type "text/x-mathjax-config").
+ // Sets hub-wide options, then points MathJax.Ajax at the bundled copy.
+ MathJax.Hub.Config({
+ messageStyle: "none",
+ imageFont: null,
+ // Both output renderers get automatic line breaking; they differ only in scale.
+ "CommonHTML": {
+ linebreaks: { automatic: true },
+ scale: 95
+ },
+ "HTML-CSS": {
+ linebreaks: { automatic: true },
+ scale: 90
+ },
+ menuSettings: {
+ zoom: "Click"
+ }
+ });
+
+ // Root path for MathJax's own resources; matches the directory of the
+ // MathJax.js script tag that follows this block.
+ MathJax.Ajax.config.root = "/bundles/peerjmathjax/MathJax/";
+</script>
+
+<script src="/bundles/peerjmathjax/MathJax/MathJax.js?config=TeX-MML-AM_HTMLorMML,Safe&noContrib"></script>
+
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research"}'></script>
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research-sidebar"}'></script>
+ <script async src="https://badge.dimensions.ai/badge.js" charset="utf-8"></script>
+
+ <div id="content-alert-container"></div>
+
+ <div id="toast-container"></div>
+
+ <div id="vue-notifications"></div>
+
+ <div id="vue-confirm-modal"></div>
+
+ <script>
+ $(PeerJ.Home.Banner.init);
+ </script>
+
+ </body>
+</html>
diff --git a/python/tests/files/scielo_article.jats.xml b/python/tests/files/scielo_article.jats.xml
new file mode 100644
index 0000000..08c864e
--- /dev/null
+++ b/python/tests/files/scielo_article.jats.xml
@@ -0,0 +1,336 @@
+<?xml version="1.0" encoding="ISO-8859-1"?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<front>
+<journal-meta>
+<journal-id>1683-9803</journal-id>
+<journal-title><![CDATA[Pediatría (Asunción)]]></journal-title>
+<abbrev-journal-title><![CDATA[Pediatr. (Asunción)]]></abbrev-journal-title>
+<issn>1683-9803</issn>
+<publisher>
+<publisher-name><![CDATA[Sociedad Paraguaya de Pediatría]]></publisher-name>
+</publisher>
+</journal-meta>
+<article-meta>
+<article-id>S1683-98032015000200002</article-id>
+<article-id pub-id-type="doi">10.18004/ped.2015.agosto.102-107</article-id>
+<title-group>
+<article-title xml:lang="es"><![CDATA[Prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años en las comunidades indígenas de Yby Yau y Azote’y, 2011]]></article-title>
+<article-title xml:lang="en"><![CDATA[Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011]]></article-title>
+</title-group>
+<contrib-group>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Valiente]]></surname>
+<given-names><![CDATA[Syntia Carolina]]></given-names>
+</name>
+<xref ref-type="aff" rid="A01"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Cañete]]></surname>
+<given-names><![CDATA[Manuel]]></given-names>
+</name>
+<xref ref-type="aff" rid="A02"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Cohene Velazquez]]></surname>
+<given-names><![CDATA[Bartola]]></given-names>
+</name>
+<xref ref-type="aff" rid="A03"/>
+</contrib>
+</contrib-group>
+<aff id="A01">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A02">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A03">
+<institution><![CDATA[,Puesto de Salud de Paso Tuya. Azote’y. Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<pub-date pub-type="pub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<pub-date pub-type="epub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<volume>42</volume>
+<numero>2</numero>
+<fpage>102</fpage>
+<lpage>107</lpage>
+<copyright-statement/>
+<copyright-year/>
+<self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_arttext&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_abstract&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_pdf&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><abstract abstract-type="short" xml:lang="es"><p><![CDATA[Introducción: La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrición. La desnutrición infantil no es solo un problema de falta de alimentos, es un conflicto social más profundo. La prevalencia de desnutrición en menores de 5 años del país es de 5,9% según datos del Instituto Nacional de Alimentación y Nutrición. Objetivo: Determinar la prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años de las comunidades indígenas de Yby Yaú y Azote’y. Materiales y Métodos: Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Pa'i Tavyterã y Mbya Guaraní de 11 comunidades indígenas de Yby Yau y Azote’y. Fueron examinados 349 menores de 5 años de edad. Para la evaluación del estado nutricional se utilizó la curva de crecimiento de la OMS. Los niños/as fueron pesados/as en balanzas mecánicas. Para la medida de la altura, los mayores de dos años fueron medidos con el tallimetro y los menores de 2 años con cinta métrica. Resultados: Se observó desnutrición en 53 niños que equivale al 15% de la muestra. De estos 60,4% padecían de desnutrición moderada y 39,6% desnutrición grave. El mayor porcentaje de desnutrición se encontró en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los niños tenían desnutrición crónica. 
Conclusiones: La prevalencia de desnutrición en indígenas en Yby Yaú y Azote’y es de 15%, lo que sobrepasa los índices de desnutrición en menores de 5 años del país.]]></p></abstract>
+<abstract abstract-type="short" xml:lang="en"><p><![CDATA[Introduction: Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. Objective: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Yaú Yby. Materials and Methods: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyterá and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. Results: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. Conclusions: The prevalence of malnutrition in indigenous children in Yby Yaú and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.]]></p></abstract>
+<kwd-group>
+<kwd lng="es"><![CDATA[Desnutrición aguda]]></kwd>
+<kwd lng="es"><![CDATA[desnutrición crónica]]></kwd>
+<kwd lng="es"><![CDATA[indígenas]]></kwd>
+<kwd lng="en"><![CDATA[Acute malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[chronic malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[indigenous]]></kwd>
+</kwd-group>
+</article-meta>
+</front><body><![CDATA[ <p align="right"><font size="3" face="Verdana"><b>ART&Iacute;CULO ORIGINAL</b></font></p> <p align="left">&nbsp;</p> <p align="left"><font size="4" face="Verdana"><b>Prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en&nbsp; ni&ntilde;os menores de 5 a&ntilde;os en las comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y, 2011</b></font></p> <p align="left"><font size="3" face="Verdana"><b><i>Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011</i></b></font></p> <p align="center">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>Syntia Carolina Ruiz Valiente<sup>(1)</sup>, Manuel Ruiz Ca&ntilde;ete<sup>(2)</sup>, Bartola Cohene Velazquez<sup>(3)</sup></b></font></p> <p align="left"> <font size="2" face="Verdana">1. Hospital General Pedi&aacute;trico Ni&ntilde;os Acosta &Ntilde;u. Reducto-San Lorenzo, Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">2. Centro de Salud de Yby Yau. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">3. Puesto de Salud de Paso Tuya. Azote&rsquo;y. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana"><b>Correspondencia</b>: Syntia Carolina Ruiz Valiente. E-mail: scrv_py@hotmail.com</font></p> ]]></body>
+<body><![CDATA[<p align="left"> <font size="2" face="Verdana">Recibido: 24/01/2015; Aceptado: 10/06/2015.</font></p> <p align="left"> <font size="2" face="Verdana"><i>Los autores declaran que no existen conflictos de inter&eacute;s en el presente estudio.</i></font></p> <p align="left">&nbsp;</p> <hr size="1" noshade> <p align="left"><font size="2" face="Verdana"><b>RESUMEN</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introducci&oacute;n: </b>La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrici&oacute;n. La desnutrici&oacute;n infantil no es solo un problema de falta de alimentos, es un conflicto social m&aacute;s profundo. La prevalencia de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s es de 5,9% seg&uacute;n datos del Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n. <b>Objetivo</b>: Determinar la prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby Ya&uacute; y Azote&rsquo;y. <b>Materiales y M&eacute;todos:</b> Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa'i Tavyter&atilde; y Mbya Guaran&iacute; de 11 comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y. Fueron examinados 349 menores de 5 a&ntilde;os de edad. Para la evaluaci&oacute;n del estado nutricional se utiliz&oacute; la curva de crecimiento de la OMS. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas. Para la medida de la altura, los mayores de dos a&ntilde;os fueron medidos con el tallimetro y los menores de 2 a&ntilde;os con cinta m&eacute;trica. <b>Resultados:</b> Se observ&oacute; desnutrici&oacute;n en 53 ni&ntilde;os que equivale al 15% de la muestra. 
De estos 60,4% padec&iacute;an de desnutrici&oacute;n moderada y 39,6% desnutrici&oacute;n grave. El mayor porcentaje de desnutrici&oacute;n se encontr&oacute; en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los ni&ntilde;os ten&iacute;an desnutrici&oacute;n cr&oacute;nica. <b>Conclusiones:</b> La prevalencia de desnutrici&oacute;n en ind&iacute;genas en Yby Ya&uacute; y Azote&rsquo;y es de 15%, lo que sobrepasa los &iacute;ndices de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s.</font></p> <p align="left"><font size="2" face="Verdana"><b>Palabras clave:</b> Desnutrici&oacute;n aguda, desnutrici&oacute;n cr&oacute;nica, ind&iacute;genas.</font></p> <p align="left">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>ABSTRACT</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introduction:</b> Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age&nbsp; in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. <b>Objective</b>: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Ya&uacute; Yby. <b>Materials and Methods</b>: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyter&aacute; and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. 
To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. <b>Results</b>: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. <b>Conclusions</b>: The prevalence of malnutrition in indigenous children in Yby Ya&uacute; and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.</font></p> <p align="left"><font size="2" face="Verdana"><b>Keywords</b>: Acute malnutrition, chronic malnutrition, indigenous.</font></p> <hr size="1" noshade> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>INTRODUCCI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">La desnutrici&oacute;n es una enfermedad multisist&eacute;mica, que afecta todos los &oacute;rganos y sistemas del ser humano, es producida por una disminuci&oacute;n dr&aacute;stica, aguda o cr&oacute;nica, en la disponibilidad de nutrimentos, ya sea por ingesti&oacute;n insuficiente, inadecuada absorci&oacute;n, exceso de p&eacute;rdidas o la conjunci&oacute;n de dos o m&aacute;s de estos factores. Se manifiesta por grados de d&eacute;ficit antropom&eacute;trico, signos y s&iacute;ntomas cl&iacute;nicos y alteraciones bioqu&iacute;micas, hematol&oacute;gicas e inmunol&oacute;gicas (1).</font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n ind&iacute;gena est&aacute; gravemente afectada por este problema, tal vez por ser un estrato olvidado y descuidado por la poblaci&oacute;n en general y por el estado paraguayo. A pesar de las leyes, y de todos los proyectos que favorecen a esta esfera de la sociedad, a&uacute;n existe un abismo inimaginable entre lo ideal y lo real. Mientras se elaboran programas que buscan dar mejores condiciones de vida a estas comunidades, que la mayor&iacute;a de las veces solo quedan plasmados en el papel, los &iacute;ndices de desnutrici&oacute;n son alarmantes. Esto se debe probablemente a que en la sociedad posmoderna, la deforestaci&oacute;n, el uso de agrot&oacute;xicos, la invasi&oacute;n de los terratenientes despoj&oacute; a los nativos de sus tierras, oblig&aacute;ndolos a vivir en situaciones carenciales, pues estos debido a su cultura esperan que la naturaleza les ofrezca el sustento diario. 
Las costumbres, la econom&iacute;a y la religi&oacute;n en las etnias Paí Tavyter&atilde; y Mby`a Guaran&iacute; est&aacute;n &iacute;ntimamente relacionadas a la producci&oacute;n alimenticia e ingesta.</font></p> <p align="left"><font size="2" face="Verdana">Para el nativo guaran&iacute; es muy dif&iacute;cil comprender que el hombre es el que debe producir alimento para su sustento, pero como la sociedad actual obliga a ello, estos por no conseguir adaptarse a los cambios que se produjeron, est&aacute;n m&aacute;s expuestos a las carencias alimentarias. Seg&uacute;n datos del gobierno central en el 2008, 41,8% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os padec&iacute;an de desnutrici&oacute;n.</font></p> <p align="left"><font size="2" face="Verdana">En un estudio realizado en M&eacute;xico, la prevalencia de desnutrici&oacute;n en ind&iacute;genas fue 39,4%(2). Un 44% present&oacute; uno o m&aacute;s signos cl&iacute;nicos de malnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Encuestas y Censos del Ecuador (2001 y 2006) 40,1% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os tienen desnutrici&oacute;n cr&oacute;nica (3).</font></p> <p align="left"><font size="2" face="Verdana">En Caracas, se hizo un estudio con la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, y ellos obtuvieron el siguiente resultado: El diagn&oacute;stico nutricional hallado con mayor frecuencia fue Nutrici&oacute;n normal (55%) seguida por Desnutrici&oacute;n Subcl&iacute;nica (15%) y Desnutrici&oacute;n Leve (12%). 
En l&iacute;neas generales, un 55% de la poblaci&oacute;n se encontraba en rangos de nutrici&oacute;n normal, mientras el 45% restante presentaba problema de malnutrici&oacute;n comprendiendo &eacute;sta por d&eacute;ficit y por exceso (4).</font></p> <p align="left"><font size="2" face="Verdana">En el Brasil en un estudio realizado para determinar el perfil nutricional de los abor&iacute;genes menores de 5 a&ntilde;os de Kaing&aacute;ngen Paran&aacute; vieron que cuando utilizado los criterios propuestos por la OMS, se registr&oacute; una alta prevalencia de d&eacute;ficit Estatura/Edad, con uno en cuatro ni&ntilde;os (24,8%) que presentaba este diagn&oacute;stico. El d&eacute;ficit de Peso/Edad fue diagnosticado en 9,2% de los ni&ntilde;os evaluados. Los &iacute;ndices de peso para la altura diagnosticaron solo tres ni&ntilde;os (2,1%) como desnutridas agudas (5).</font></p> <p align="left"><font size="2" face="Verdana">En otro estudio realizado tambi&eacute;n en el Brasil, esta vez en Amazonia, con ni&ntilde;os de la etnia Suru&iacute; se observ&oacute; que los porcentajes de los ni&ntilde;os con d&eacute;ficit en los &iacute;ndices de estatura para la edad fue 31,4%, peso para la edad 12,4% y peso para la estatura 0% (6).</font></p> <p align="left"><font size="2" face="Verdana">El objetivo del presente estudio es determinar la prevalencia de desnutrici&oacute;n en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y y conocer el comportamiento alimentario de los ni&ntilde;os/as de las comunidades ind&iacute;genas estudiadas.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>MATERIALES Y M&Eacute;TODOS</b></font></p> <p align="left"><font size="2" face="Verdana">Estudio transversal, descriptivo realizado en el periodo de enero a abril del a&ntilde;o 2011, donde se identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa&#297; Tavyter&atilde; y Mby`a Guaran&iacute; en los distritos de Yby-Ya&uacute; y Azote&rsquo;y.</font></p> <p align="left"><font size="2" face="Verdana">El tama&ntilde;o muestral total fue de 370 ni&ntilde;os, determinado a trav&eacute;s de censo realizado por el Centro de Salud de Yby-Ya&uacute; y el Puesto de Salud de Paso Tuya. Para los fines del estudio fueron identificados 349 ni&ntilde;os (94.3%) de ni&ntilde;os reci&eacute;n nacidos a menores de 5 a&ntilde;os en los distritos de Yby-Ya&uacute; y Azote'y.</font></p> <p align="left"><font size="2" face="Verdana">Las etnias que se encuentran dentro del &aacute;rea de estudio est&aacute; compuesta por los mby`a guaran&iacute; y los pa&#297; tavyter&atilde;, distribuidas en las siguientes comunidades ind&iacute;genas: Vy'apav&#7869;, Yrapey, Guyrakeha, Guyra &Ntilde;e'engatuamba, Satí;, San Juan, Mbery'o Jaguarymi, Ka'aguy Poty Rory, Yvyra'ija, Tukambiju y Takuaritiy.</font></p> <p align="left"><font size="2" face="Verdana">El trabajo se realiz&oacute; por concentraci&oacute;n, en los locales fijados por los l&iacute;deres de las distintas comunidades. Fue aplicado un cuestionario a las madres, creado para el efecto por medio de entrevista. 
La edad de los ni&ntilde;os fue dada por las madres, pues la mayor&iacute;a de estas no cuentan con registro de nacimiento, ni siquiera certificado de nacido vivo.</font></p> <p align="left"><font size="2" face="Verdana">Para la evaluaci&oacute;n del estado nutricional de los ni&ntilde;os se opt&oacute; por la curva del gr&aacute;fico de crecimiento de la Organizaci&oacute;n Mundial de la Salud (OMS) lo cual est&aacute; contenido en la libreta del ni&ntilde;o y la ni&ntilde;a. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas, los que ya consegu&iacute;an quedarse de pie fueron pesados en balanza de pie y los ni&ntilde;os menores de 1 a&ntilde;o en balanzas colgantes.</font></p> <p align="left"><font size="2" face="Verdana">Para la medida de la altura, los ni&ntilde;os mayores de dos a&ntilde;os fueron colocados en posici&oacute;n de pie, bien rectos, y fueron medidos con el tallimetro. La talla de los ni&ntilde;os menores de 2 a&ntilde;os fue realizada con cinta m&eacute;trica con el ni&ntilde;o/a en dec&uacute;bito supino en superficie recta.</font></p> <p align="left"><font size="2" face="Verdana">Los datos fueron analizados manualmente, y los gr&aacute;ficos confeccionados con el programa Microsoft Office Excel 2007.</font></p> <p align="justify">&nbsp;</p> ]]></body>
+<body><![CDATA[<p align="left"><font size="3" face="Verdana"><b>RESULTADOS</b></font></p> <p align="left"><font size="2" face="Verdana">Se evaluaron 349 ni&ntilde;os, que representan el 94,3% del total de abor&iacute;genes menores de 5 a&ntilde;os de las comunidades de Yby-Ya&uacute; y Azote&rsquo;y. Del total de 349 ni&ntilde;os, 69 % (240) son Paí; Tavyter&atilde; y 31% (109) Mby`a Guaran&iacute;. </font></p> <p align="left"><font size="2" face="Verdana">La comunidad con el mayor porcentaje de ni&ntilde;os fue la de Vy'&atilde;pav&#7869; (36,4%), y la de menor frecuencia fue la comunidad de Tekoha Kag&atilde;t&atilde;, que es una comunidad reci&eacute;n formada localizada en Pasi&ntilde;o (<a href="#2a02f1">Figura 1</a>).</font></p> <p align="center"><a name="2a02f1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f1.jpg"></p> <p align="left"><font size="2" face="Verdana">Viendo el perfil nutricional de los ni&ntilde;os, se pudo observar que 61% de los ni&ntilde;os/as no est&aacute;n desnutridos, 24% de los ni&ntilde;os/as est&aacute;n en riesgo de desnutrici&oacute;n y 15% est&aacute;n con desnutrici&oacute;n. Aunque se trata de un estrato social desfavorecido tambi&eacute;n se observa &iacute;ndice de sobrepeso y obesidad, en las comunidades de Vy'&atilde;pav&#7869; e Yrapey (<a href="#2a02f2">Figura 2</a>).</font></p> <p align="center"><a name="2a02f2"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f2.jpg"></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">Teniendo presente los gr&aacute;ficos de Talla/Edad la prevalencia de desnutrici&oacute;n cr&oacute;nica es bastante elevada, pues 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. El mayor &iacute;ndice de desnutrici&oacute;n se encuentran en los primeros 24 meses de vida (<a href="#2a02t1">Tabla 1</a>). De los 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. Siendo que el mayor porcentaje de desnutrici&oacute;n se observa en Vy'&atilde;pav&#7869;.</font></p> <p align="center"><a name="2a02t1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02t1.jpg"></p> <p align="left"><font size="2" face="Verdana">Se estudi&oacute; adem&aacute;s el comportamiento alimentario de estos ni&ntilde;os, viendo que alimentos preferencialmente hacen parte de su dieta y la edad de introducci&oacute;n de los mismos, la mayor&iacute;a de las madres introducen alg&uacute;n tipo alimento entre los 6 y 8 meses de edad (<a href="#2a02f3">Figura 3</a>) y los primeros alimentos introducidos dependen del lugar donde estos habitan. 
El caldo de pescado es uno de los primeros alimentos introducidos en las comunidades que viven cerca de los r&iacute;os, entretanto el 60% inician la alimentaci&oacute;n con caldo de arroz y caldo de fideo.</font></p> <p align="center"><a name="2a02f3"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f3.jpg"></p> <p align="left"><font size="2" face="Verdana">Al observar la frecuencia en que se alimentan estos ni&ntilde;os, el 64% se alimenta tres veces al d&iacute;a, el 20% menos de 3 veces al d&iacute;a y solo el 16 % m&aacute;s de tres veces al d&iacute;a.</font></p> <p align="left"><font size="2" face="Verdana">El principal nutriente en la dieta son los carbohidratos, el 47% de los ni&ntilde;os consumen carbohidratos m&aacute;s de 5 veces por semana, y el 21% menos de 3 veces por semana. El mayor porcentaje de consumo de prote&iacute;nas se observa en las comunidades que se encuentran cerca de r&iacute;os (Guyra &Ntilde;e`engatuamba y Mbery'o Jaguarymi), siendo que 70% consume prote&iacute;nas menos de 3 veces por semana, y solo el 3% m&aacute;s de cinco veces por semana. El consumo de verduras y hortalizas es muy escaso, el 91% consume verduras y hortalizas menos de 3 veces por semana, el 2% m&aacute;s de 5 veces y 7% entre 3 y 5 veces por semana.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>DISCUSI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">A lo largo de toda la historia de la humanidad, la desnutrici&oacute;n ha sido una patolog&iacute;a de las clases sociales menos privilegiadas, son los que no poseen las condiciones necesarias para tener una vida digna, donde la educaci&oacute;n, salud, recursos econ&oacute;micos son miserables, donde esta dolencia alcanza su auge (7).</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n los datos del Censo realizado por la Unidad de Salud Ind&iacute;gena que se encuentra en el Distrito de Yby-Ya&uacute;, los Puestos de Salud de Yby- Ya&uacute; y Azote&rsquo;y en el tercer trimestre del A&ntilde;o 2010, se encontraron 328 ni&ntilde;os de hasta 60 meses (8). Al realizar los trabajos de campo, este n&uacute;mero se elev&oacute; a 349 individuos, por lo que se hizo un nuevo censo solo con los ni&ntilde;os de este grupo etario. Ese fen&oacute;meno tal vez, se deba a la migraciones que se desarrollan normalmente entre los guaran&iacute;. Al observar la historia, y tambi&eacute;n por la experiencia que se adquiri&oacute; durante el trabajo de campo, se pudo observar la familia ling&uuml;&iacute;stica a la cual pertenecen los mby`a y los paí; (la guaran&iacute;) son n&oacute;madas, es com&uacute;n que migren a otras comunidades, en un mismo Tekoha (9,10).</b></font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n diana fue de 370 ni&ntilde;os menores de 5 a&ntilde;os de los cuales se lleg&oacute; a entrevistar a las madres de 349 y se hizo las mediciones antropom&eacute;tricas posteriormente. 
En la mayor&iacute;a de las comunidades ind&iacute;genas se obtuvo el 100% de participaci&oacute;n, son excepciones las comunidades de Yrapey y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Del total de ni&ntilde;os/as, la etnia de mayor prevalencia fue la de Paí; Tavyter&atilde;. En relaci&oacute;n al sexo, las comunidades son bastante equilibradas, con una ligera prevalencia del sexo masculino sobre el femenino.</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n datos de la UNICEF en Paraguay se observa 3,4% de desnutrici&oacute;n aguda en ni&ntilde;os menores de 5 a&ntilde;os (11). La prevalencia de desnutrici&oacute;n en los ni&ntilde;os paraguayos menores de 5 a&ntilde;os en el &aacute;rea rural es de 5,9% y en el &aacute;rea urbana es de 4,5% (12). Existen pocas publicaciones sobre este tema en abor&iacute;genes menores de 5 a&ntilde;os, siendo que el mayor n&uacute;mero de publicaciones fue realizado por el Brasil (12,4%), M&eacute;xico (39,4%) y Ecuador.</b></font></p> <p align="left"><font size="2" face="Verdana">La prevalencia de desnutrici&oacute;n en las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y es de 15,2%, observando los gr&aacute;ficos de Peso/edad si de 2 a&ntilde;os y Peso/Talla en mayores de 2 a&ntilde;os y menores de 5 a&ntilde;os. Las comunidades donde la desnutrici&oacute;n son m&aacute;s prevalentes son Guyrakeha e Yvyra'ija; en Satí; y Tekoha Kagat&atilde; no se encontr&oacute; ni&ntilde;os desnutridos.</b></font></p> <p align="left"><font size="2" face="Verdana">De 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. El grupo con mayor &iacute;ndice de desnutrici&oacute;n, se encuentra durante los primeros 24 meses, pues es en esta etapa donde el organismo requiere una mayor cantidad de nutrientes por el mayor crecimiento. 
Adem&aacute;s, despu&eacute;s de los 6 meses se inicia la introducci&oacute;n de otros alimentos. Estos dos factores, asociados aumentan el &iacute;ndice de desnutrici&oacute;n en este grupo de edad.</b></font></p> <p align="left"><font size="2" face="Verdana">De la poblaci&oacute;n total de los ni&ntilde;os estudiados el 23,8% est&aacute;n con riesgo de desnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n (INAN) en el a&ntilde;o 2010, 13,6% de ni&ntilde;os menores de 5 a&ntilde;os del &aacute;rea urbana y 16,2% del &aacute;rea rural del Paraguay sufren desnutrici&oacute;n cr&oacute;nica. En una encuesta realizada por la Direcci&oacute;n General de Estad&iacute;stica, Encuestas y Censos en el a&ntilde;o 2008, 41,8% de los ni&ntilde;os/as ind&iacute;genas menores de cinco a&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Observadas las medidas de Talla/Edad el 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Ese dato es alarmante, porque la desnutrici&oacute;n cr&oacute;nica es consecuencia de una carencia prolongada de alimentos o enfermedades sucesivas. En Tukambiju, Mbery'o Jaguarymi, Guyrakeha, Yvyra'ija y Satí; son comunidades con una prevalencia mayor al 80% de ni&ntilde;os/as con talla baja para la edad.</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">El &iacute;ndice de desnutrici&oacute;n en ind&iacute;genas en los distritos de Yby-Ya&uacute; y Azote&rsquo;y, sobrepasa la prevalencia general de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s, lo cual est&aacute; alrededor de 5.9% seg&uacute;n datos del INAN.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas se puede observar que un porcentaje razonable introduce alimentos entre los 6 meses y antes de los 9 meses. El porcentaje de los que introducen antes de los 6 meses es de 18,6% y entre los 9 meses y un a&ntilde;o es de 27%. Se pudo observar que, ocho ni&ntilde;os tuvieron lactancia materna exclusiva por m&aacute;s de 1 a&ntilde;o. Todos los ni&ntilde;os/as con lactancia materna exclusiva en la fecha de la recolecci&oacute;n de datos ten&iacute;a menos de 6 meses o 6 meses. El caldo de fideo y de arroz ocupa el primer y segundo lugar respectivamente como primer alimento introducido por las madres. Los alimentos que deber&iacute;an ser introducidos inicialmente como el pur&eacute; de frutas y verduras ocupan un peque&ntilde;o porcentaje en la lista. Otros alimentos que se tendr&iacute;an que introducir despu&eacute;s de los 9 meses, de preferencia a los un a&ntilde;o, como por ejemplo el caldo de poroto, caldo de pescado, leche de vaca y huevo son los primeros alimentos que se introducen.</b></font></p> <p align="left"><font size="2" face="Verdana">El 64% de los ni&ntilde;os se alimentan tres veces al d&iacute;a, el 20,5% menos de tres veces y 15,5% m&aacute;s de tres veces al d&iacute;a.</b></font></p> <p align="left"><font size="2" face="Verdana">El 69,5% de los ni&ntilde;os/as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y consumen prote&iacute;nas menos de tres veces por semana; 27,3% consumen de tres a cinco veces por semana los diferentes tipos de prote&iacute;nas, teniendo predominancia el consumo de pez. 
Solo 3,2% consume prote&iacute;nas m&aacute;s de 5 veces. Las comunidades que viven cerca de bosques, r&iacute;os o arroyos son los que m&aacute;s consumen prote&iacute;nas.</b></font></p> <p align="left"><font size="2" face="Verdana">Los carbohidratos son la principal fuente de alimentaci&oacute;n de los ni&ntilde;os y ni&ntilde;as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y. Eso se debe a que son los alimentos de m&aacute;s f&aacute;cil adquisici&oacute;n y los m&aacute;s accesibles econ&oacute;micamente hablando.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas el consumo de verduras y hortalizas es escaso. Las comunidades que m&aacute;s consumen verduras y hortalizas son Mberyo Jaguarymi y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Este trabajo refleja la realidad de las comunidades ind&iacute;genas de los dos distritos observados, no podemos extrapolar estas mismas cifras en el departamento de Concepci&oacute;n, o en todo el pa&iacute;s por el tama&ntilde;o de la muestra, es necesario hacer nuevos estudios con un tama&ntilde;o muestral mayor para obtener una visi&oacute;n del verdadero estado nutricional de los ni&ntilde;os ind&iacute;genas. El porcentaje de desnutrici&oacute;n es alto, pero se trata de distritos con no muchos recursos econ&oacute;micos, donde la pobreza es una realidad a&uacute;n en otros estratos sociales.</b></font></p> <p align="left"><font size="2" face="Verdana">La realidad ind&iacute;gena es un problema real, y una manera de reducir estas cifras es ense&ntilde;&aacute;ndoles a producir su propio alimento. 
Para ello no debemos luchar con su cultura ni intentar hacerlos ver el mundo a trav&eacute;s de nuestra realidad, sino dentro de sus costumbres encontrar formas de que ellos tengan condiciones de un mejor porvenir.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>AGRADECIMIENTOS</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">A las comunidades ind&iacute;genas que participaron en nuestro estudio, los profesionales de blanco del Centro de Salud de Yby-Yau y Azote&rsquo;y, a la Comunidad de Hermanas de la Divina Providencia de Yby-Yau, a la Dra. Blanca Villalba y a la Dra. Gloria Mart&iacute;nez.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>REFERENCIAS</b></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">1. Monteiro CA. Fome, desnutri&ccedil;&atilde;o e pobreza: al&eacute;m da sem&acirc;ntica. Sa&uacute;de Soc. 2003;12(1):7-11. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102986&pid=S1683-9803201500020000200001&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">2. Vi&ntilde;as MR, Fr&iacute;as ML, Verd&uacute; JM. Entorno social y desnutrici&oacute;n en ni&ntilde;os de 1 a 4 a&ntilde;os de comunidades ind&iacute;genas de M&eacute;xico. Rev Esp Nutr Comunitaria. 2005;11(3):128-34. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102988&pid=S1683-9803201500020000200002&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">3. INEC. Ecuador: 40,1% de ind&iacute;genas con desnutrici&oacute;n cr&oacute;nica. Ecuador: Estudio del INEC; 2009. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102990&pid=S1683-9803201500020000200003&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">4. Chumpitaz D, Russo A, Del NogaL B, Case C, Lares M. Evaluaci&oacute;n nutricional de la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004. AVFT. 2006;25(1):26-31. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102992&pid=S1683-9803201500020000200004&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">5. Kuhl AM, Tittoni C, Leite MS, Bastos JL. Perfil Nutricional e fatores associados &agrave; ocorr&ecirc;ncia de desnutri&ccedil;&atilde;o entre crian&ccedil;as ind&iacute;genas Kaing&aacute;ng da Terra Ind&iacute;gena de Mangueirinha, Paran&aacute;, Brasil. Cad Sa&uacute;de P&uacute;blica. 2009;25(2):409-420. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102994&pid=S1683-9803201500020000200005&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">6. Orellana JD, Coimbra Jr. CE, Louren&ccedil;o AE, Santos RV. Estado nutricional e anemia en crian&ccedil;as Suru&iacute;, Amaz&ocirc;nia, Brasil. J Pediatr (Rio J). 2006;82(5):383-88. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102996&pid=S1683-9803201500020000200006&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">7. Organizaci&oacute;n de las Naciones Unidas. Foro permanente para las cuestiones ind&iacute;genas: informe sobre el quinto per&iacute;odo de sesiones (15 a 26 de mayo de 2006). Nueva York: Naciones Unidas; 2006. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102998&pid=S1683-9803201500020000200007&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">8. Centro de Salud de Yby-Yau. Censo local de las comunidades ind&iacute;genas. Yby-Yau; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103000&pid=S1683-9803201500020000200008&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">9. Chase-Sardi M, Brun A, Enciso MA. Situaci&oacute;n sociocultural, econ&oacute;mica, jur&iacute;dico-pol&iacute;tico actual de las comunidades ind&iacute;genas del Paraguay. Asunci&oacute;n: UCA; 1989. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103002&pid=S1683-9803201500020000200009&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">10. 
Meliá B, Grunberg G, Grunberg F. Paî -Tavyterã: etnograf&iacute;a guaran&iacute; del Paraguay contempor&aacute;neo. 2da. ed. Asunci&oacute;n: Centro de Estudios Antrop&oacute;logicos de la Universidad Cat&oacute;lica; 2008. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103004&pid=S1683-9803201500020000200010&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">11. FAO. Panorama de la seguridad alimentaria y nutricional en Am&eacute;rica Latina y el Caribe 2013. FAO; 2014. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103006&pid=S1683-9803201500020000200011&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --> </font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">12. Masi C, S&aacute;nchez Bernal S, Dallman D, Rodas A, Morinigo G, Mendoza L. Perfil nutricional de ni&ntilde;os menores de 5 a&ntilde;os que acuden a servicios p&uacute;blicos de salud en el Paraguay. Asunci&oacute;n: INAN; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103008&pid=S1683-9803201500020000200012&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> ]]></body><back>
+<ref-list>
+<ref id="B1">
+<label>1</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Monteiro]]></surname>
+<given-names><![CDATA[CA]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Fome, desnutrição e pobreza: além da semântica]]></article-title>
+<source><![CDATA[Saúde Soc]]></source>
+<year>2003</year>
+<volume>12</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>7-11</page-range></nlm-citation>
+</ref>
+<ref id="B2">
+<label>2</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Viñas]]></surname>
+<given-names><![CDATA[MR]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Frías]]></surname>
+<given-names><![CDATA[ML]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Verdú]]></surname>
+<given-names><![CDATA[JM]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="es"><![CDATA[Entorno social y desnutrición en niños de 1 a 4 años de comunidades indígenas de México]]></article-title>
+<source><![CDATA[Rev Esp Nutr Comunitaria]]></source>
+<year>2005</year>
+<volume>11</volume>
+<numero>3</numero>
+<issue>3</issue>
+<page-range>128-34</page-range></nlm-citation>
+</ref>
+<ref id="B3">
+<label>3</label><nlm-citation citation-type="book">
+<collab>INEC</collab>
+<source><![CDATA[Ecuador: 40,1% de indígenas con desnutrición crónica]]></source>
+<year>2009</year>
+<publisher-loc><![CDATA[Ecuador ]]></publisher-loc>
+<publisher-name><![CDATA[Estudio del INEC]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B4">
+<label>4</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chumpitaz]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Russo]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Del NogaL]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Case]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lares]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Evaluación nutricional de la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004]]></article-title>
+<source><![CDATA[AVFT]]></source>
+<year>2006</year>
+<volume>25</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>26-31</page-range></nlm-citation>
+</ref>
+<ref id="B5">
+<label>5</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Kuhl]]></surname>
+<given-names><![CDATA[AM]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Tittoni]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Leite]]></surname>
+<given-names><![CDATA[MS]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Bastos]]></surname>
+<given-names><![CDATA[JL]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Perfil Nutricional e fatores associados à ocorrência de desnutrição entre crianças indígenas Kaingáng da Terra Indígena de Mangueirinha, Paraná, Brasil]]></article-title>
+<source><![CDATA[Cad Saúde Pública]]></source>
+<year>2009</year>
+<volume>25</volume>
+<numero>2</numero>
+<issue>2</issue>
+<page-range>409-420</page-range></nlm-citation>
+</ref>
+<ref id="B6">
+<label>6</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Orellana]]></surname>
+<given-names><![CDATA[JD]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Coimbra Jr]]></surname>
+<given-names><![CDATA[CE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lourenço]]></surname>
+<given-names><![CDATA[AE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Santos]]></surname>
+<given-names><![CDATA[RV]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Estado nutricional e anemia en crianças Suruí, Amazônia, Brasil]]></article-title>
+<source><![CDATA[J Pediatr (Rio J)]]></source>
+<year>2006</year>
+<volume>82</volume>
+<numero>5</numero>
+<issue>5</issue>
+<page-range>383-88</page-range></nlm-citation>
+</ref>
+<ref id="B7">
+<label>7</label><nlm-citation citation-type="book">
+<collab>Organización de las Naciones Unidas</collab>
+<source><![CDATA[Foro permanente para las cuestiones indígenas: informe sobre el quinto período de sesiones (15 a 26 de mayo de 2006)]]></source>
+<year>2006</year>
+<publisher-loc><![CDATA[Nueva York ]]></publisher-loc>
+<publisher-name><![CDATA[Naciones Unidas]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B8">
+<label>8</label><nlm-citation citation-type="">
+<collab>Centro de Salud de Yby-Yau</collab>
+<source><![CDATA[Censo local de las comunidades indígenas]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Yby-Yau ]]></publisher-loc>
+</nlm-citation>
+</ref>
+<ref id="B9">
+<label>9</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chase-Sardi]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Brun]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Enciso]]></surname>
+<given-names><![CDATA[MA]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Situación sociocultural, económica, jurídico-político actual de las comunidades indígenas del Paraguay]]></source>
+<year>1989</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[UCA]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B10">
+<label>10</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Meliá]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[F]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Paî -Tavyterã: etnografía guaraní del Paraguay contemporáneo. 2da. ed]]></source>
+<year>2008</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[Centro de Estudios Antropólogicos de la Universidad Católica]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B11">
+<label>11</label><nlm-citation citation-type="book">
+<collab>FAO</collab>
+<source><![CDATA[Panorama de la seguridad alimentaria y nutricional en América Latina y el Caribe 2013]]></source>
+<year>2014</year>
+<publisher-name><![CDATA[FAO]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B12">
+<label>12</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Masi]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Sánchez Bernal]]></surname>
+<given-names><![CDATA[S]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Dallman]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Rodas]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Morinigo]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Mendoza]]></surname>
+<given-names><![CDATA[L]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Perfil nutricional de niños menores de 5 años que acuden a servicios públicos de salud en el Paraguay]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[INAN]]></publisher-name>
+</nlm-citation>
+</ref>
+</ref-list>
+</back>
+</article>
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
new file mode 100644
index 0000000..e6e48ac
--- /dev/null
+++ b/python/tests/test_html_ingest.py
@@ -0,0 +1,14 @@
+
+import datetime
+import pytest
+
+from sandcrawler.html_ingest import *
+
+
+def test_html_extract_ojs3() -> None:
+
+ with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
+ ojs3_html = f.read()
+
+ fulltext = html_extract_body_teixml(ojs3_html)
+ assert fulltext['status'] == 'success'
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
new file mode 100644
index 0000000..b428b0d
--- /dev/null
+++ b/python/tests/test_html_metadata.py
@@ -0,0 +1,227 @@
+
+import datetime
+import pytest
+
+from sandcrawler.html_metadata import *
+
+
+def test_html_metadata_plos() -> None:
+
+ with open('tests/files/plos_one_article.html', 'r') as f:
+ plos_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
+ assert meta is not None
+ assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert meta.doi == "10.1371/journal.pone.0213978"
+ assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert meta.contrib_names == [
+ "Yang Li",
+ "Tuanjie Wang",
+ "Lin Wang",
+ "Mingjun Sun",
+ "Zhizhong Cui",
+ "Shuang Chang",
+ "Yongping Wu",
+ "Xiaodong Zhang",
+ "Xiaohui Yu",
+ "Tao Sun",
+ "Peng Zhao",
+ ]
+ assert meta.container_name == "PLOS ONE"
+ assert meta.container_abbrev == "PLOS ONE"
+ # "Apr 22, 2019"
+ assert meta.release_date == datetime.date(year=2019, month=4, day=22)
+ assert meta.first_page == "e0213978"
+ assert meta.issue == "4"
+ assert meta.volume == "14"
+ assert meta.container_issn == "1932-6203"
+ assert meta.publisher == "Public Library of Science"
+ assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_elife() -> None:
+
+ with open('tests/files/elife_article.html', 'r') as f:
+ elife_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(elife_html))
+ assert meta is not None
+ assert meta.title == "Parallel visual circuitry in a basal chordate"
+ assert meta.doi == "10.7554/eLife.44753"
+ assert meta.contrib_names == [
+ "Matthew J Kourakis",
+ "Cezar Borba",
+ "Angela Zhang",
+ "Erin Newman-Smith",
+ "Priscilla Salas",
+ "B Manjunath",
+ "William C Smith",
+ ]
+ assert meta.container_name == "eLife"
+ # 2019-04-18
+ assert meta.release_date == datetime.date(year=2019, month=4, day=18)
+ assert meta.publisher == "eLife Sciences Publications Limited"
+
+
+def test_html_metadata_peerj() -> None:
+
+ with open('tests/files/peerj_oa_article.html', 'r') as f:
+ peerj_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
+ assert meta is not None
+ assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ assert meta.doi == "10.7717/peerj.4375"
+ assert meta.contrib_names == [
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
+ ]
+ assert meta.container_name == "PeerJ"
+ # "2018-02-13"
+ assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+ assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url
+
+
+def test_html_metadata_nature() -> None:
+
+ with open('tests/files/nature_article.html', 'r') as f:
+ nature_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
+ assert meta is not None
+ assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
+ assert meta.doi == "10.1038/d41586-020-02610-z"
+ assert meta.contrib_names == [
+ "Diana Kwon",
+ ]
+ assert meta.container_name == "Nature"
+ # "2020-09-10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.publisher == "Nature Publishing Group"
+ # note: some error in dublin code in nature HTML resulting in duplication
+ assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+
+
+def test_html_metadata_ojs3() -> None:
+
+ with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ ojs3_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
+ assert meta is not None
+ assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
+ assert meta.doi == "10.5210/fm.v25i10.10274"
+ assert meta.contrib_names == [
+ "Calvin Liang",
+ "Jevan Alexander Hutson",
+ "Os Keyes",
+ ]
+ assert meta.container_name == "First Monday"
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_issn == "1396-0466"
+ # "2020/09/10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.lang == "en"
+ assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_dlib() -> None:
+
+ with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ dlib_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
+ assert meta is not None
+ assert meta.doi == "10.1045/may2017-vanhyning"
+ # "2017-05-15"
+ assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
+def test_html_metadata_dc_case() -> None:
+ """
+ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
+ """
+
+ snippet = """
+ <html>
+ <head>
+ <meta name="DC.Citation.Issue" content="123"/>
+ </head>
+ <body>Hi.</body>
+ </html>"""
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(snippet))
+ assert meta is not None
+ assert meta.issue == "123"
+
+@pytest.fixture
+def adblock() -> Any:
+    return load_adblock_rules()  # rule set handed to html_extract_resources() by tests requesting this fixture
+
+def test_html_resources(adblock) -> None:
+
+ with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ dlib_html = f.read()
+
+ resources = html_extract_resources(
+ "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
+ HTMLParser(dlib_html),
+ adblock,
+ )
+
+ assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources
+
+ # check that adblock working
+ for r in resources:
+ assert '/ga.js' not in r['url']
+
+ with open('tests/files/plos_one_article.html', 'r') as f:
+ plos_html = f.read()
+
+ resources = html_extract_resources(
+ "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+ HTMLParser(plos_html),
+ adblock,
+ )
+
+ # check that custom adblock working
+ for r in resources:
+ assert 'crossmark-cdn.crossref.org' not in r['url']
+
+ with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ monday_html = f.read()
+
+ resources = html_extract_resources(
+ "https://firstmonday.org/blah/",
+ HTMLParser(monday_html),
+ adblock,
+ )
+
+ with open('tests/files/elife_article.html', 'r') as f:
+ elife_html = f.read()
+
+ resources = html_extract_resources(
+ "https://elife.org/blah/",
+ HTMLParser(elife_html),
+ adblock,
+ )
+
+ with open('tests/files/nature_article.html', 'r') as f:
+ nature_html = f.read()
+
+ resources = html_extract_resources(
+ "https://nature.com/blah/",
+ HTMLParser(nature_html),
+ adblock,
+ )
+
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index ed93341..255e3fb 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -2,6 +2,7 @@
import pytest
import struct
import responses
+import poppler
from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
from sandcrawler.pdfextract import process_pdf
@@ -20,6 +21,7 @@ def test_process_fake_pdf():
resp = process_pdf(pdf_bytes)
assert resp.status == 'not-pdf'
+@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
def test_process_dummy_pdf():
with open('tests/files/dummy.pdf', 'rb') as f:
pdf_bytes = f.read()
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
new file mode 100644
index 0000000..a996c56
--- /dev/null
+++ b/python/tests/test_xml.py
@@ -0,0 +1,18 @@
+
+import pytest
+
+from sandcrawler.xml import xml_reserialize
+
+
+def test_xml_reserialize() -> None:
+
+ with open('tests/files/scielo_article.jats.xml', 'rb') as f:
+ raw_xml = f.read()
+
+ assert b'encoding="ISO-8859-1"' in raw_xml
+ raw_xml.decode("ISO-8859-1")
+ with pytest.raises(UnicodeDecodeError):
+ raw_xml.decode("utf-8")
+
+ str_xml = xml_reserialize(raw_xml)
+ assert 'encoding="UTF-8"' in str_xml
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
new file mode 100644
index 0000000..d089c7e
--- /dev/null
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -0,0 +1,19 @@
+
+-- Dump up to 1000 grobid rows (as JSON) that have a fatcat_release but no matching fatcat_file row.
+-- Run like: psql sandcrawler < THING.sql   (add "> THING.2019-09-23.json" only with the TO STDOUT variant below)
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(grobid)
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL
+ LIMIT 1000
+)
+TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json';
+--TO STDOUT
+--WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 59423dd..73bd7f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta (
-- encrypted
);
+CREATE TABLE IF NOT EXISTS html_meta (
+    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    scope               TEXT CHECK (octet_length(scope) >= 1),  -- nullable, but non-empty when set
+    has_teixml          BOOLEAN NOT NULL,
+    has_thumbnail       BOOLEAN NOT NULL,
+    word_count          INT CHECK (word_count >= 0),
+    biblio              JSONB,
+    resources           JSONB
+    -- biblio JSON fields are similar to fatcat release schema
+    -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
@@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
-- ext_ids (source/source_id sometimes enough)
-- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
-- edit_extra
+ -- ingest type can be: pdf, xml, html
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
index 1738731..1c872cc 100644
--- a/sql/monitoring_queries.md
+++ b/sql/monitoring_queries.md
@@ -39,6 +39,32 @@ Broken domains, past 30 days:
ORDER BY COUNT DESC
LIMIT 25;
+Summary of significant domains and status, past 7 days:
+
+    SELECT domain, status, count
+    FROM (
+        SELECT domain, status, COUNT(*) AS count
+        FROM (
+            SELECT
+                ingest_file_result.status,
+                substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+            FROM ingest_file_result
+            -- inner join: the ingest_request filters below would discard unmatched rows anyway
+            INNER JOIN ingest_request
+                ON ingest_file_result.ingest_type = ingest_request.ingest_type
+                AND ingest_file_result.base_url = ingest_request.base_url
+            WHERE
+                ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+                AND ingest_request.ingest_type = 'pdf'
+                AND ingest_request.ingest_request_source = 'fatcat-changelog'
+        ) t1
+        WHERE t1.domain != ''
+        GROUP BY CUBE (domain, status)
+    ) t2
+    WHERE count > 500
+    ORDER BY domain ASC , count DESC;
+
+
Throughput per day, and success, for past 30 days:
SELECT ingest_request.ingest_type,