Diffstat (limited to 'python')
-rw-r--r--  python/.coveragerc | 1
-rw-r--r--  python/.flake8 | 21
-rw-r--r--  python/.gitignore | 13
-rw-r--r--  python/.pylintrc | 2
-rw-r--r--  python/Makefile | 16
-rw-r--r--  python/Pipfile | 39
-rw-r--r--  python/Pipfile.lock | 2196
-rw-r--r--  python/README.md | 46
-rw-r--r--  python/TODO | 7
-rw-r--r--  python/example.env | 5
-rwxr-xr-x  python/grobid2json.py | 215
-rwxr-xr-x  python/grobid_tool.py | 151
-rwxr-xr-x  python/ia_pdf_match.py | 93
-rwxr-xr-x  python/ingest_file.py | 87
-rwxr-xr-x  python/ingest_tool.py | 244
-rwxr-xr-x  python/pdfextract_tool.py | 112
-rwxr-xr-x  python/pdftrio_tool.py | 112
-rwxr-xr-x  python/persist_tool.py | 240
-rw-r--r--  python/pyproject.toml | 7
-rw-r--r--  python/pytest.ini | 8
-rw-r--r--  python/sandcrawler/__init__.py | 59
-rw-r--r--  python/sandcrawler/db.py | 590
-rw-r--r--  python/sandcrawler/fileset_platforms.py | 832
-rw-r--r--  python/sandcrawler/fileset_strategies.py | 387
-rw-r--r--  python/sandcrawler/fileset_types.py | 74
-rw-r--r--  python/sandcrawler/grobid.py | 359
-rw-r--r--  python/sandcrawler/html.py | 414
-rw-r--r--  python/sandcrawler/html_metadata.py | 1077
-rw-r--r--  python/sandcrawler/ia.py | 1067
-rw-r--r--  python/sandcrawler/ingest.py | 446
-rw-r--r--  python/sandcrawler/ingest_file.py | 925
-rw-r--r--  python/sandcrawler/ingest_fileset.py | 516
-rw-r--r--  python/sandcrawler/ingest_html.py | 499
-rw-r--r--  python/sandcrawler/minio.py | 49
-rw-r--r--  python/sandcrawler/misc.py | 241
-rw-r--r--  python/sandcrawler/pdfextract.py | 389
-rw-r--r--  python/sandcrawler/pdftrio.py | 101
-rw-r--r--  python/sandcrawler/persist.py | 738
-rw-r--r--  python/sandcrawler/workers.py | 446
-rw-r--r--  python/sandcrawler/xml.py | 6
-rwxr-xr-x  python/sandcrawler_worker.py | 349
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py | 70
-rwxr-xr-x  python/scripts/archiveorg_fileset.py | 135
-rwxr-xr-x  python/scripts/cdx_collection.py | 82
-rwxr-xr-x  python/scripts/covid2ingestrequest.py | 79
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py | 94
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py | 193
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py | 182
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py | 144
-rwxr-xr-x  python/scripts/enrich_scored_matches.py | 19
-rwxr-xr-x  python/scripts/fetch_cdx_sha1hex.py | 170
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py | 127
-rwxr-xr-x  python/scripts/filter_groupworks.py | 48
-rwxr-xr-x  python/scripts/filter_scored_matches.py | 49
-rwxr-xr-x  python/scripts/grobid_affiliations.py | 37
-rwxr-xr-x  python/scripts/import_grobid_metadata.py | 69
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py | 36
-rwxr-xr-x  python/scripts/manifest_converter.py | 7
-rwxr-xr-x  python/scripts/oai2ingestrequest.py | 112
-rwxr-xr-x  python/scripts/pdf_thumbnail.py | 15
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py | 80
-rw-r--r--  python/tests/files/crossref_api_work_978-3-030-64953-1_4.json | 1
-rw-r--r--  python/tests/files/crossref_api_work_s1047951103000064.json | 1
-rw-r--r--  python/tests/files/dlib_05vanhyning.html | 350
-rw-r--r--  python/tests/files/first_monday_ojs3_fulltext.html | 441
-rw-r--r--  python/tests/files/first_monday_ojs3_landingpage.html | 616
-rw-r--r--  python/tests/files/genders_g58_fairlie.html | 146
-rw-r--r--  python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml | 66
-rw-r--r--  python/tests/files/grobid_refs_s1047951103000064.tei.xml | 499
-rw-r--r--  python/tests/files/nature_article.html | 1379
-rw-r--r--  python/tests/files/peerj_oa_article.html | 2365
-rw-r--r--  python/tests/files/scielo_article.jats.xml | 336
-rw-r--r--  python/tests/files/small.json | 7
-rw-r--r--  python/tests/test_grobid.py | 199
-rw-r--r--  python/tests/test_grobid2json.py | 26
-rw-r--r--  python/tests/test_html.py | 38
-rw-r--r--  python/tests/test_html_ingest.py | 10
-rw-r--r--  python/tests/test_html_metadata.py | 261
-rw-r--r--  python/tests/test_ingest.py | 258
-rw-r--r--  python/tests/test_live_wayback.py | 54
-rw-r--r--  python/tests/test_misc.py | 99
-rw-r--r--  python/tests/test_pdfextract.py | 50
-rw-r--r--  python/tests/test_pushers.py | 33
-rw-r--r--  python/tests/test_savepagenow.py | 265
-rw-r--r--  python/tests/test_wayback.py | 195
-rw-r--r--  python/tests/test_xml.py | 17
l---------  python/title_slug_denylist.txt (renamed from python/title_slug_blacklist.txt) | 0
87 files changed, 18594 insertions(+), 4045 deletions(-)
diff --git a/python/.coveragerc b/python/.coveragerc
index 67053a7..51038d6 100644
--- a/python/.coveragerc
+++ b/python/.coveragerc
@@ -2,4 +2,3 @@
omit = tests/*
source =
sandcrawler
- grobid2json
diff --git a/python/.flake8 b/python/.flake8
new file mode 100644
index 0000000..c7ef5fe
--- /dev/null
+++ b/python/.flake8
@@ -0,0 +1,21 @@
+[flake8]
+select = C,E,F,W,ANN
+# ANN003 is annotation on, eg, **kwargs
+# ANN101 is annotation on 'self' (why would that be wanted?)
+# ANN204 is annotation on '__init__()'
+# ANN401 is 'Any' type
+# E265,E266 are restrictions on comments ('#')
+# E501 is line-too-long, which we enforce with black
+# W503,E203 are allowed by black
+# TODO: C901 is complexity, should be re-enabled at some point
+ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203
+per-file-ignores =
+ sandcrawler/__init__.py: F401
+ sandcrawler/ia.py: E402
+ tests/*.py: ANN201,ANN001,F403,F405
+ # TODO: add more annotations to CLI scripts
+ *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405
+ scripts: ANN201,ANN001,ANN202,ANN206,ANN205
+exclude = .git,__pycache__,.venv,scripts/
+max-line-length = 96
+max-complexity = 30
diff --git a/python/.gitignore b/python/.gitignore
index d53fac8..a5a773e 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,3 +1,14 @@
*part-000*
*.tar.gz
-*.tsv.gz
+*.gz
+htmlcov/
+samples/
+*.json
+TODO*
+*.tsv
+
+!.flake8
+!.gitlab-ci.yml
+!.pylintrc
+!.coveragerc
+!.gitignore
diff --git a/python/.pylintrc b/python/.pylintrc
index 80e203d..387bca1 100644
--- a/python/.pylintrc
+++ b/python/.pylintrc
@@ -11,4 +11,4 @@ include-ids=yes
notes=FIXME,XXX,DELETEME
[TYPECHECK]
-ignored-modules=responses
+extension-pkg-whitelist=selectolax,pydantic,responses
diff --git a/python/Makefile b/python/Makefile
index 43ec144..940a7eb 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -14,23 +14,19 @@ deps: ## Install dependencies using pipenv
.PHONY: lint
lint: ## Run lints (eg, flake8, mypy)
- #pipenv run flake8 . --exit-zero
- pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
+ pipenv run flake8 . --exit-zero
+ pipenv run isort -q -c . || true
pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
- #pipenv run pytype sandcrawler/
.PHONY: fmt
fmt: ## Run code formating on all source code
- pipenv run black *.py sandcrawler/ tests/
+ pipenv run isort --atomic .
+ pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py
.PHONY: test
-test: lint ## Run all tests and lints
+test: ## Run all tests and lints
pipenv run pytest
.PHONY: coverage
coverage: ## Run all tests with coverage
- pipenv run pytest --cov
-
-.PHONY: coverage-html
-coverage-html: ## Run all tests with coverage, HTML report output
- pipenv run pytest --cov --cov-report html
+ pipenv run pytest --cov --cov-report=term --cov-report=html
diff --git a/python/Pipfile b/python/Pipfile
index 17734ad..b841755 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,6 +1,6 @@
[[source]]
name = "ia"
-url = "https://devpi.archive.org/wb/prod"
+url = "https://devpi.us.archive.org/wb/prod"
verify_ssl = true
[[source]]
@@ -18,38 +18,51 @@ pytest-mock = "*"
pylint = "*"
ipython = "*"
mypy = "*"
-pytype = "*"
flake8 = "*"
flake8-annotations = "*"
+isort = "*"
+types-requests = "*"
+types-beautifulsoup4 = "*"
+types-dateparser = "*"
+types-psycopg2 = "*"
+types-Pillow = "*"
+black = "*"
[packages]
requests = ">=2"
-raven = {extras = ['flask'],version = "*"}
confluent-kafka = "*"
python-snappy = "*"
boto3 = "*"
-minio = "*"
+minio = "<7.0.0"
psycopg2 = "*"
bs4 = "*"
python-magic = "*"
ftfy = "*"
internetarchive = "*"
-Flask = ">=1"
urlcanon = "*"
-pillow = ">=3"
+Pillow = ">=3"
python-poppler = ">=0.2.1"
-
-# must lock black to an exact version because it is still "beta"
-# see: https://github.com/psf/black/issues/517
-black = "==19.10b0"
+selectolax = ">=0.2"
+# constraining trafilatura to prevent a version conflict with
+# `charset_normalizer`, between htmldate and requests
+trafilatura = ">=1,<1.4"
+htmldate= ">=1,<1.4"
+pydantic = ">=1.7"
+dateparser = "*"
+braveblock = "*"
+dynaconf = ">=3"
+sentry-sdk = { version = ">=0.14.0", extras = [] }
+zstandard = "*"
+grobid_tei_xml = ">=0.1.2,<0.2.0"
+PyMuPDF = ">=1.19.0,<1.20.0"
[requires]
-python_version = "3.7"
+python_version = "3.8"
[packages.globalwayback]
-version = ">=0.3"
+version = ">=0.6.5"
index = "ia"
[packages.wayback]
-version = ">=0.2.1.2"
+version = ">=0.6.3"
index = "ia"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index fcc1434..546a420 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,16 +1,16 @@
{
"_meta": {
"hash": {
- "sha256": "0710cce29b75fe2092b0bf2cbbe758688e6ffb34dc26a01fc769007bd1c66f2c"
+ "sha256": "35d0f0cd2f3903cce19d5a73f50a89ba09a1b43abbda84894fd45411d7f32760"
},
"pipfile-spec": 6,
"requires": {
- "python_version": "3.7"
+ "python_version": "3.8"
},
"sources": [
{
"name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
+ "url": "https://devpi.us.archive.org/wb/prod",
"verify_ssl": true
},
{
@@ -21,117 +21,164 @@
]
},
"default": {
- "aerospike": {
- "hashes": [
- "sha256:3c3edb9c59491100cf5f9b0d802ee0b812b32b626c16358133cf5b9931ab8620",
- "sha256:42e6ed4f6298aab4e5094f45a69fc805f925fbaa4ec206a87ce0a2048df02d4d",
- "sha256:67684fb6af531765eb6061e37597bc73a348a2eff141795447ab20d9c6a61289",
- "sha256:6aec5e0dbedb8ddd97441abaebedb04d4abbd51bfcfd6f0a6722fabc5be4efd0",
- "sha256:9280ecb0257b0b706df7ac934dc03f518641934479d9c925a46af5231fb65f40",
- "sha256:98779725a86ef345b9fec0b5ef60b59b2430b9c8c8e8904adb7945af6d6f9ffb",
- "sha256:99de79a26f184a47a67123899e093cecd5c3bc0b0ce92da4f302684ad0b0116c",
- "sha256:b170b637d69f49c02d021477359866c3d89a2c0d1477bec19343828f890d3cb1",
- "sha256:d2f0b0288e2efafb99bbada6b39714285f317dc47fde3c4458b76e8cfbb71c11",
- "sha256:d83faa27d40af320058a93902e19173f6295acbcc9ca225c552d7648169859f0",
- "sha256:db7efad41300cb9bd6e70534c3110fce4e474db6d4288428609e0214a021aab8"
- ],
- "version": "==3.10.0"
- },
- "appdirs": {
- "hashes": [
- "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
- "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"
- ],
- "version": "==1.4.4"
- },
- "attrs": {
- "hashes": [
- "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
- "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
- ],
- "version": "==19.3.0"
- },
- "backports.csv": {
- "hashes": [
- "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
- "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
- ],
- "version": "==1.0.7"
+ "async-timeout": {
+ "hashes": [
+ "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15",
+ "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.2"
+ },
+ "backports.zoneinfo": {
+ "hashes": [
+ "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
+ "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
+ "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
+ "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
+ "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
+ "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
+ "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
+ "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
+ "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
+ "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
+ "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
+ "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
+ "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
+ "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
+ "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
+ "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
+ ],
+ "markers": "python_version < '3.9' and python_version >= '3.6' and python_version < '3.9'",
+ "version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
- "sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7",
- "sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8",
- "sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"
+ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
+ "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
- "version": "==4.9.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.11.1"
},
- "black": {
+ "boto3": {
"hashes": [
- "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b",
- "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"
+ "sha256:7a6766c7177a9c6f85365e02aabd96ca4d72e08bc5cb127cb51b0a97ac9b9d1b",
+ "sha256:82b790b1dabd0746b028d2013b5d4d636a41f3aaf25520081f4c173cb6eb395d"
],
"index": "ia",
- "version": "==19.10b0"
+ "version": "==1.26.37"
},
- "blinker": {
+ "botocore": {
"hashes": [
- "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
+ "sha256:18ab8e95345a6d0d2653ce65d261a0aef6fef8a57a35a89e3cea6ffe315e92fc",
+ "sha256:3afa4fec9f7713caa05116563b38f81bec7bd20585d517155484d3f25efab5aa"
],
- "version": "==1.4"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.29.37"
},
- "boto3": {
+ "braveblock": {
"hashes": [
- "sha256:16f83ca3aa98d3faeb4f0738b878525770323e5fb9952435ddf58ca09aacec7c",
- "sha256:dc87ef82c81d2938f91c7ebfa85dfd032fff1bd3b67c9f66d74b21f8ec1e353d"
+ "sha256:0bfca14473275366f2f822751c4e8dde7f94ee5ce8a9372244870452458f4fe1",
+ "sha256:107050b2e1c885b748727573a54a85d2e1ea9ad86146370f6eb79ca18b9673d4",
+ "sha256:13f9769eac9c4027eba2f400e635572796f7a7feb343f442d13c4b78e7d6f536",
+ "sha256:14efeada36418525da7c3b26393041b85242ffa1165328ec7eaf9b9780b72d62",
+ "sha256:1ab6980d10b8a02fd0dc73e28f18a0a3e17be636d314c1fdaa3bbb3e36a81f0f",
+ "sha256:45286418a43a3dfab50bdaf922f5003dbd2c3d1f696d23883568f4fa14b8093e",
+ "sha256:66c2442154102bff8df9c6f05cb72cd5cda6f4e1ed88592800ab1b6e8100e806",
+ "sha256:73de4f925ae5442d3361a71d7c0eeb1b4c540bf3d0c91100a00325ccef9e743c",
+ "sha256:80cbeeb6d083bc2a9106214188e5ce05362f248c1051344dc6673b7b38a561da",
+ "sha256:8460b10c9b82cc9d0b6056e1fe206bea209fe5a83ba87bdf9486305657224a44",
+ "sha256:903c506fc05eb6b76e4d31f957c1118078582db80f8ef5ce5ac74418f094d498",
+ "sha256:dcb773e3e275de896efebe57159a67587283d6ca1d1a36695170a3756fd2ef3a"
],
"index": "ia",
- "version": "==1.14.10"
- },
- "botocore": {
- "hashes": [
- "sha256:b22db58da273b77529edef71425f9c281bc627b1b889f81960750507238abbb8",
- "sha256:cb0d7511a68439bf6f16683489130e06c5bbf9f5a9d647e0cbf63d79f3d3bdaa"
- ],
- "version": "==1.17.10"
+ "version": "==0.3.0"
},
"brotli": {
"hashes": [
- "sha256:0538dc1744fd17c314d2adc409ea7d1b779783b89fd95bcfb0c2acc93a6ea5a7",
- "sha256:0970a47f471782912d7705160b2b0a9306e68e6fadf9cffcaeb42d8f0951e26c",
- "sha256:113f51658e6fe548dce4b3749f6ef6c24de4184ba9c10a909cbee4261c2a5da0",
- "sha256:1e1aa9c4d1558889f42749c8baf846007953bfd32c8209230cf1cd1f5ef33495",
- "sha256:2f2f4f78f29ac4a45d15b3d9fc3fd9705e0ad313a44b129f6e1d0c6916bad0e2",
- "sha256:3269f6de1dd150fd0cce1c158b61ff5ac06d627fd3ae9c6ea03aed26fbbff7ea",
- "sha256:3f4a1f6240916c7984c7f2542786710f622992508dafee0b1714e6d340fb9ffd",
- "sha256:50dd9ad2a2bb12da4e9002a438672d182f98e546e99952de80280a1e1729664f",
- "sha256:5519a4b01b1a4f965083cbfa2ef2b9774c5a5f352341c47b50776ad109423d72",
- "sha256:5eb27722d320370315971c427eb8aa7cc0791f2a458840d357ac653bd0ad3a14",
- "sha256:5f06b4d5b6f58e5b5c220c2f23cad034dc5efa51b01fde2351ced1605bd980e2",
- "sha256:71ceee286ea7ec613f1c36f1c6181864a6ca24ebb55e371276f33d6af8742834",
- "sha256:72848d25a5f9e736db4af4512e0c3feecc094d57d241f8f1ae959115a2c39756",
- "sha256:743001bca75f4a6b4454be3510feca46f9d61a0c782a9bc2bc684bdb245e279e",
- "sha256:7ac98c71a15648fd11bc1f32608b6110e396121280790082e32b9a3109048bc6",
- "sha256:9d1c2dd27a1083fefd05b1b2f8df4a6bc2aaa6c21dd82cd41c8ae5e7c23a87f8",
- "sha256:a13ce9b419fe9f277c63f700efb0e444331509d1881b5610d2ba7e9080606967",
- "sha256:a19ef0952b9d2803df88dff07f45a6c92d5676afb9b8d69cf32232d684036d11",
- "sha256:ad766ca8b8c1419b71a22756b45264f45725c86133dc80a7cbe30b6b78c75620",
- "sha256:ad7963f261988ee0883816b6b9f206f11461c9b3cb5cfbca0c9ab5adc406d395",
- "sha256:af0451e23016631a2f52925a10d738ac4a0f794ac315c30380b22efc0c90cbc6",
- "sha256:c16201060c5a3f8742e3deae759014251ac92f382f82bc2a41dc079ff18c3f24",
- "sha256:c43b202f65891861a9a336984a103de25de235f756de69e32db893156f767013",
- "sha256:c675c6cce4295cb1a692f3de7416aacace7314e064b94bc86e93aceefce7fd3e",
- "sha256:d17cec0b992b1434f5f9df9986563605a4d1b1acd5574c87fc2ac014bcbd3316",
- "sha256:dc91f6129953861a73d9a65c52a8dd682b561a9ebaf65283541645cab6489917",
- "sha256:e2f4cbd1760d2bf2f30e396c2301999aab0191aec031a6a8a04950b2f575a536",
- "sha256:f192e6d3556714105c10486bbd6d045e38a0c04d9da3cef21e0a8dfd8e162df4",
- "sha256:f775b07026af2b1b0b5a8b05e41571cdcf3a315a67df265d60af301656a5425b",
- "sha256:f969ec7f56ba9636679e69ca07fba548312ccaca37412ee823c7f413541ad7e0",
- "sha256:f9dc52cd70907aafb99a773b66b156f2f995c7a0d284397c487c8b71ddbef2f9",
- "sha256:f9ee88bb52352588ceb811d045b5c9bb1dc38927bc150fd156244f60ff3f59f1",
- "sha256:fc7212e36ebeb81aebf7949c92897b622490d7c0e333a479c0395591e7994600"
- ],
- "version": "==1.0.7"
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
+ "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
+ "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
+ "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
+ "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
+ "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
+ "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
+ "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
+ "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
+ "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
+ "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
+ "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
+ "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
+ "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
+ "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
+ "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
+ "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
+ "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
+ "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
+ "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
+ "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
+ "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
+ "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
+ "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
+ "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
+ "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
+ "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
+ "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
+ "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
+ "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
+ "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
+ "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
+ "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
+ "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
+ "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
+ "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
+ "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
+ "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
+ "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
+ "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
+ "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
+ "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
+ "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
+ "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
+ "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
+ "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
+ "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
+ "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
+ "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
+ ],
+ "version": "==1.0.9"
},
"bs4": {
"hashes": [
@@ -142,76 +189,141 @@
},
"certifi": {
"hashes": [
- "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3",
- "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.6.20"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"chardet": {
"hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5",
+ "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"
],
- "version": "==3.0.4"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.1.0"
},
- "click": {
+ "charset-normalizer": {
"hashes": [
- "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
- "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==7.1.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
"configparser": {
"hashes": [
- "sha256:2ca44140ee259b5e3d8aaf47c79c36a7ab0d5e94d70bd4105c03ede7a20ea5a1",
- "sha256:cffc044844040c7ce04e9acd1838b5f2e5fa3170182f6fda4d2ea8b0099dbadd"
+ "sha256:8be267824b541c09b08db124917f48ab525a6c3e837011f3130781a224c57090",
+ "sha256:b065779fd93c6bf4cee42202fa4351b4bb842e96a3fb469440e484517a49b9fa"
],
- "version": "==5.0.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.3.0"
},
"confluent-kafka": {
"hashes": [
- "sha256:1b10a9e4ede8c7ee382c16075b55275963d3fe9b8eec3fc511d0868847cc6eed",
- "sha256:1c46cbc2eb0876f0cdbd33ed7ea684ed1b009a25b65cf87736d3506d2f4ae57e",
- "sha256:2500a78334d642e49b98710722e548c0e3d5dc4c6eae63f02d66448678ed2922",
- "sha256:2515771b18d190df2182881abcf02fe8fde0aab567402ff36295b35cd495de65",
- "sha256:3150c8875511e2cea4086206f3c10448f744c9c35f9033fd0874c8c55f7b87e2",
- "sha256:4b0a3c47f9183570e9ee77ae8c36080fbc1996045251e25772944e4dadf1db21",
- "sha256:4f875798bbc766767b9c6ed95b084fde851e0bf074527ab0daffa87f4e750635",
- "sha256:515049659b045b99e0464d5ff5def4785478490563bc5ac1341a4f29dc335e82",
- "sha256:52088adf1abdf3a384a54ec7a3bfaa0b61e5da8cc03a2e26a8351bbbf49f72a9",
- "sha256:5342d3ff348b8082eaa4c63f4c82a72f3bf0ef8efa12a8580c890fa6e160f761",
- "sha256:55734905c5a8642e596cf1e60ec4d86f05d31a185cbc71d1c73430bb0c08db19",
- "sha256:624349587e97135996383c58edd8d53b38c57d653e6536c1f816049fc75faea3",
- "sha256:804a7d71b3cb61444930af67986064c9555b8c33f05a27003ea314d6c847e522",
- "sha256:931231853cec933addfafa27772177dcfab899d82e2e39fe7485c0602088daf7",
- "sha256:a4f5edc1d7958bbf5f12ba83c1f83e22a66daa9c4318c7f28c5bb1db9289fe09",
- "sha256:a591936a90095144451f041315239b2c823b7a15fa820cf45e45c422591345d6",
- "sha256:a6eb8f3f553e98a6ef0d00f9cf8e4e8dde73c914a43a00fecef97330de80bcea",
- "sha256:aa48215edcf16071d44ba29951c82c5f541d5ec915590aff0b4240e8e13f3ba3",
- "sha256:bfacb9fa0e3a5e31a5ac9a5da15de656e95e7153e022ec5620095b76a6098ec0",
- "sha256:bfbcbe7068690369ac2de3fe953854de34ad5e901157e96bcb990ca8b86d1d93",
- "sha256:c2660807e5c1ecd723e280f76918794c3fd84595000c1e8de1f254f5d89a785c",
- "sha256:c42ff838ee5e248f95f65b5adca4e2fdd4a2817fa26cede36d83a426e0f1370c",
- "sha256:c5b741764d8ea2b8334fdaf4b56297c5bab780142f1c0cad0bd642cac30cb89e",
- "sha256:dac33a04f73093de275953867a05de244560aa9842def6316cbb52bc0f02eff3",
- "sha256:f1695a00789795f9f798588bb62688b563baf471a76ca20fa01c957844938d7d",
- "sha256:f25836e03559a381ba74b9a6940b716e61ba8ae2db2d5d3a40accbc60617e1af"
+ "sha256:24872e3e427b16f77461ae7e6cf48f9c5c03c06884ac51bad179580a4dd29145",
+ "sha256:2fb97bd25d436bd59fe079885aa77a3a2f23cface9c6359d4700053665849262",
+ "sha256:3207c76d1510571cbda85560c293dec5f8d6645103b3f471abab5c83e51a7ccd",
+ "sha256:344a7fec57d3348002392a7bd5cf66fb9dbe4a103e81636037cccd6fff944e28",
+ "sha256:382739e499deaf488459c2307ebcc0e9b3653340801d6053c207c84ad710ee8d",
+ "sha256:4d6bfcc352cd608fcf325037b4425c0edaeae0c6a5439423a865110b59f897e9",
+ "sha256:4f27ddf7daf630a95e1d7dfddd0c8cf8a7755c9567dc9851bf2e74c22f34af42",
+ "sha256:5b24587b30a4d288a7b1c5cc756ee707fc1293fa28454f8db40267ed9d7e73c8",
+ "sha256:6ab745babc33a864e3ca3a2659c005ed52503e39936fff5812eeb21920009c8b",
+ "sha256:7e6592533b3f8cfbc086ea2d472058f10e5f6a04a388edca01773285c63284b4",
+ "sha256:b9ad6ad9d58c2735129f94f044b2236d7de87d77a101c8c630363add12d62a4a",
+ "sha256:be7b37020f614017d8a047565b3fb61ceef9c30a9ee093f9373d06a4c32068ae",
+ "sha256:bef263b6d78a3e63399e1b82cc07cbb30af762884df96a369cba0e1011405344",
+ "sha256:c4b7c4d0b647952d2b506948131d6e7e1c42ccb16aac8e3e52369c16b94e7215",
+ "sha256:d036bf5e1d7cb3743125d7caf62b1a23b12e403de240144b6117ddbb8f815a33",
+ "sha256:d0cbf8e7510497afd651e134bccb9d579aa90234e45734046fcb6b752d2ee312",
+ "sha256:d533ea0e527122f177943ee35eb356b8d9f7af35fe357e0cdc0514d95804aaea",
+ "sha256:e41b9313c44f54a3cd29b0e95fa32a8e685edaa9287b338f59530b21ebc0b453",
+ "sha256:e9107767cc9240cbf9b5c0fdded5eeead86a1690d1c15de6cbbdcc9d7e3b1962",
+ "sha256:f96033c335da26ea1716ab9adfce459c211b023ca09528f958fb28bf099fc0df",
+ "sha256:f970a2c6d22c934ea68d645abcc96056ecb107489f28a38b2171f65655b7e41f",
+ "sha256:fe31b3b6930d67380df371f5088950f93da5fac580cde3bedb35f992b2498e1b",
+ "sha256:ff08b9f978f8b37f2961614a68f9fdb4fabd10cdd940234e80200806d93a1c30",
+ "sha256:ff4d1557b7fb72e752c36205a344863b8f4f23b3a834780fc36eb7ebde614de7"
],
"index": "ia",
- "version": "==1.4.2"
+ "version": "==1.9.2"
},
"contextlib2": {
"hashes": [
- "sha256:01f490098c18b19d2bd5bb5dc445b2054d2fa97f09a4280ba2c5f3c394c8162e",
- "sha256:3355078a159fbb44ee60ea80abd0d87b80b78c248643b49aa6d94673b413609b"
+ "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f",
+ "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"
],
- "version": "==0.6.0.post1"
+ "markers": "python_version >= '3.6'",
+ "version": "==21.6.0"
+ },
+ "courlan": {
+ "hashes": [
+ "sha256:d06c5b048b2b5cd5c0ac77304dc24b795e4bb257a7b6077ea405a3b5e99ae179",
+ "sha256:d141d30f8e52d344cf9904aa29e4d8750e934026bdbca2dc7bd58b750566d058"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
"crawllib": {
"hashes": [
- "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
+ "sha256:9a30a10318dc706f1e27ff0af950ac14a77f73c18d329771f44d872fd63630e3"
+ ],
+ "version": "==0.1.6"
+ },
+ "cython": {
+ "hashes": [
+ "sha256:061e25151c38f2361bc790d3bcf7f9d9828a0b6a4d5afa56fbed3bd33fb2373a",
+ "sha256:06be83490c906b6429b4389e13487a26254ccaad2eef6f3d4ee21d8d3a4aaa2b",
+ "sha256:07d173d3289415bb496e72cb0ddd609961be08fe2968c39094d5712ffb78672b",
+ "sha256:0bbc27abdf6aebfa1bce34cd92bd403070356f28b0ecb3198ff8a182791d58b9",
+ "sha256:0ea8267fc373a2c5064ad77d8ff7bf0ea8b88f7407098ff51829381f8ec1d5d9",
+ "sha256:3875c2b2ea752816a4d7ae59d45bb546e7c4c79093c83e3ba7f4d9051dd02928",
+ "sha256:39afb4679b8c6bf7ccb15b24025568f4f9b4d7f9bf3cbd981021f542acecd75b",
+ "sha256:3f85eb2343d20d91a4ea9cf14e5748092b376a64b7e07fc224e85b2753e9070b",
+ "sha256:40eff7aa26e91cf108fd740ffd4daf49f39b2fdffadabc7292b4b7dc5df879f0",
+ "sha256:479690d2892ca56d34812fe6ab8f58e4b2e0129140f3d94518f15993c40553da",
+ "sha256:4a4b03ab483271f69221c3210f7cde0dcc456749ecf8243b95bc7a701e5677e0",
+ "sha256:513e9707407608ac0d306c8b09d55a28be23ea4152cbd356ceaec0f32ef08d65",
+ "sha256:5514f3b4122cb22317122a48e175a7194e18e1803ca555c4c959d7dfe68eaf98",
+ "sha256:5ba622326f2862f9c1f99ca8d47ade49871241920a352c917e16861e25b0e5c3",
+ "sha256:63b79d9e1f7c4d1f498ab1322156a0d7dc1b6004bf981a8abda3f66800e140cd",
+ "sha256:656dc5ff1d269de4d11ee8542f2ffd15ab466c447c1f10e5b8aba6f561967276",
+ "sha256:67fdd2f652f8d4840042e2d2d91e15636ba2bcdcd92e7e5ffbc68e6ef633a754",
+ "sha256:79e3bab19cf1b021b613567c22eb18b76c0c547b9bc3903881a07bfd9e7e64cf",
+ "sha256:856d2fec682b3f31583719cb6925c6cdbb9aa30f03122bcc45c65c8b6f515754",
+ "sha256:8669cadeb26d9a58a5e6b8ce34d2c8986cc3b5c0bfa77eda6ceb471596cb2ec3",
+ "sha256:8733cf4758b79304f2a4e39ebfac5e92341bce47bcceb26c1254398b2f8c1af7",
+ "sha256:97335b2cd4acebf30d14e2855d882de83ad838491a09be2011745579ac975833",
+ "sha256:afbce249133a830f121b917f8c9404a44f2950e0e4f5d1e68f043da4c2e9f457",
+ "sha256:b0595aee62809ba353cebc5c7978e0e443760c3e882e2c7672c73ffe46383673",
+ "sha256:b6da3063c5c476f5311fd76854abae6c315f1513ef7d7904deed2e774623bbb9",
+ "sha256:c8e8025f496b5acb6ba95da2fb3e9dacffc97d9a92711aacfdd42f9c5927e094",
+ "sha256:cddc47ec746a08603037731f5d10aebf770ced08666100bd2cdcaf06a85d4d1b",
+ "sha256:cdf10af3e2e3279dc09fdc5f95deaa624850a53913f30350ceee824dc14fc1a6",
+ "sha256:d968ffc403d92addf20b68924d95428d523436adfd25cf505d427ed7ba3bee8b",
+ "sha256:dbee03b8d42dca924e6aa057b836a064c769ddfd2a4c2919e65da2c8a362d528",
+ "sha256:e1958e0227a4a6a2c06fd6e35b7469de50adf174102454db397cec6e1403cce3",
+ "sha256:e6ffa08aa1c111a1ebcbd1cf4afaaec120bc0bbdec3f2545f8bb7d3e8e77a1cd",
+ "sha256:e83228e0994497900af954adcac27f64c9a57cd70a9ec768ab0cb2c01fd15cf1",
+ "sha256:ea1dcc07bfb37367b639415333cfbfe4a93c3be340edf1db10964bc27d42ed64",
+ "sha256:eca3065a1279456e81c615211d025ea11bfe4e19f0c5650b859868ca04b3fcbd",
+ "sha256:ed087eeb88a8cf96c60fb76c5c3b5fb87188adee5e179f89ec9ad9a43c0c54b3",
+ "sha256:eeb475eb6f0ccf6c039035eb4f0f928eb53ead88777e0a760eccb140ad90930b",
+ "sha256:eefd2b9a5f38ded8d859fe96cc28d7d06e098dc3f677e7adbafda4dcdd4a461c",
+ "sha256:f3fd44cc362eee8ae569025f070d56208908916794b6ab21e139cea56470a2b3",
+ "sha256:f9944013588a3543fca795fffb0a070a31a243aa4f2d212f118aa95e69485831"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.29.32"
+ },
+ "dateparser": {
+ "hashes": [
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
- "version": "==0.1.4.8"
+ "index": "ia",
+ "version": "==1.1.4"
},
"dawg": {
"hashes": [
@@ -230,10 +342,11 @@
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==4.4.2"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
},
"docopt": {
"hashes": [
@@ -241,54 +354,72 @@
],
"version": "==0.6.2"
},
- "docutils": {
- "hashes": [
- "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0",
- "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
- "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
- ],
- "version": "==0.15.2"
- },
"dogpile.cache": {
"hashes": [
"sha256:bc9dde1ffa5de0179efbcdc73773ef0553921130ad01955422f2932be35c059e"
],
"version": "==0.9.2"
},
+ "dynaconf": {
+ "hashes": [
+ "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165",
+ "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"
+ ],
+ "index": "ia",
+ "version": "==3.1.11"
+ },
"elasticsearch": {
"hashes": [
- "sha256:540d633afcc0a32972e4b489c4559c9a96e294850853238f7a18b1cbd267c2ed",
- "sha256:a8062a00b61bc7babeea028530667583a68ecb1a9f59ab0b22ff7feaf70d3564"
+ "sha256:840adeb45a5ec9102a83f3cf481aae83a3775b75d6dd83a7310b04e44a5d0308",
+ "sha256:f511ea92e96db09b0e96b0de5fbbb7aa5c3740b0c571a364a2c3a1cc7ec06203"
],
- "version": "==6.8.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'",
+ "version": "==7.17.8"
},
- "flask": {
+ "filelock": {
"hashes": [
- "sha256:4efa1ae2d7c9865af48986de8aeb8504bf32c7f3d6fdc9353d34b21f4b127060",
- "sha256:8a4fdd8936eba2512e9c85df320a37e694c93945b33ef33c89946a340a238557"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
- "index": "ia",
- "version": "==1.1.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.8.2"
},
"ftfy": {
"hashes": [
- "sha256:67f9c8b33a4b742376a3eda11b0e3bd5c0cbe719d95ea0bfd3736a7bdd1c24c8"
+ "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca",
+ "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"
],
"index": "ia",
- "version": "==5.7"
+ "version": "==6.1.1"
},
"globalwayback": {
"hashes": [
- "sha256:46724c1445afa79f6e2d2ccf98e76eed072ff36df50409ed90ff26344a4b4ac4"
+ "sha256:683f19dee720ef11335952aa33615e50c945196c82e18a5d8150635f92022d23"
],
"index": "ia",
- "version": "==0.6.1"
+ "version": "==0.8.12.6"
+ },
+ "grobid-tei-xml": {
+ "hashes": [
+ "sha256:022fdf54dbd067b520c1effe3c1a1f2ac248492ea310627e9462757748cb461b",
+ "sha256:35c9afb14f6f76100dce5f5815e67ec9fa4122e2f268394e0baf6eafbd8668d8"
+ ],
+ "index": "ia",
+ "version": "==0.1.3"
+ },
+ "htmldate": {
+ "hashes": [
+ "sha256:603b86eaf0f076efcd653d57fe0470305f751417711f4e373279235d0ff587e6",
+ "sha256:83830715faf0f22272d9e24e571a4955308a008107d0ca9359c0de77b99766cd"
+ ],
+ "index": "ia",
+ "version": "==1.3.2"
},
"ialib": {
"hashes": [
- "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
+ "sha256:0b1745e512266fd6c91af68763f2f8427eec6c92c5009fc75c50d9352fc764fc"
],
- "version": "==0.3.0.1"
+ "version": "==0.5.1.1"
},
"idna": {
"hashes": [
@@ -299,100 +430,201 @@
},
"internetarchive": {
"hashes": [
- "sha256:6071c5be1a4f933af9e2dfa015cc0d63e79c404cfa29ae26121e54181079c947",
- "sha256:bad1c4152fb6286ce7c77737a853bb4e45bcefb89ca5834d75607419f08cb6fe"
+ "sha256:de856465c2ef6852184d08bfd59c0ca01904865b373a27b383034ac6b4128eb6"
],
"index": "ia",
- "version": "==1.9.3"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
- "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
- ],
- "version": "==1.1.0"
+ "version": "==3.0.2"
},
"jinja2": {
"hashes": [
- "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
- "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
+ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
+ "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "version": "==2.11.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.1.2"
},
"jmespath": {
"hashes": [
- "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
- "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
+ "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+ "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
- "version": "==0.10.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.0.1"
},
"jsonpatch": {
"hashes": [
- "sha256:83ff23119b336ea2feffa682307eb7269b58097b4e88c089a4950d946442db16",
- "sha256:e45df18b0ab7df1925f20671bbc3f6bd0b4b556fb4b9c5d97684b0a7eac01744"
+ "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397",
+ "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"
],
- "version": "==1.26"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.32"
},
"jsonpointer": {
"hashes": [
- "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
- "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
- ],
- "version": "==2.0"
+ "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
+ "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.3"
+ },
+ "justext": {
+ "hashes": [
+ "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf",
+ "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"
+ ],
+ "version": "==3.0.0"
+ },
+ "langcodes": {
+ "hashes": [
+ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
+ "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==3.3.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
- "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
- "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
- "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
- "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
- "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
- "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
- "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
- "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
- "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
- "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
- "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
- "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
- "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
- "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
- "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
- "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
- "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
- "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
- "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
- "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
- "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
- "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
- "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
- "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
- "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
- "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
- "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
- "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
- "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
- "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
- "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
- ],
- "version": "==1.1.1"
+ "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
+ "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
+ "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
+ "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
+ "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
+ "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
+ "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
+ "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
+ "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
+ "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
+ "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
+ "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
+ "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
+ "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
+ "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
+ "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
+ "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
+ "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
+ "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
+ "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
+ "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
+ "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
+ "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
+ "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
+ "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
+ "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
+ "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
+ "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
+ "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
+ "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
+ "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
+ "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
+ "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
+ "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
+ "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
+ "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
+ "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
+ "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
+ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
+ "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.1"
},
"minio": {
"hashes": [
- "sha256:6ecb7637a35f806733e9d112eacfa599a58d7c3d4698fda2b5c86fff5d34b417",
- "sha256:71984a47fc8268afdfd1d0ed5e45e72f45f6495591878b0eaa7f77b2503e96ab",
- "sha256:ba5978a97e3366983c8b4ea11f2ae8e1add995ab4789e0098dd2403199999ac4"
+ "sha256:7cb075b56bac894551304cb824f958069a84e0dd2d0a685f9bed3c05e15727bf",
+ "sha256:acae9bfae0aec1b92025bd63e18135ebb4994c84600716c5323e14cb0c9a0b03",
+ "sha256:eec4ab073ff979c34e928e532d8acc1d40d61ba4404709cf27ab3ecdcfa2a561"
],
"index": "ia",
- "version": "==5.0.10"
+ "version": "==6.0.2"
},
- "pathspec": {
+ "perfstat": {
"hashes": [
- "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0",
- "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"
+ "sha256:4f91fab9be6076972c66fe818eed488be28f1044009237adccce42ff2c7861f5"
],
- "version": "==0.8.0"
+ "version": "==0.1.0.1"
},
"pillow": {
"hashes": [
@@ -451,22 +683,22 @@
},
"psycopg2": {
"hashes": [
- "sha256:132efc7ee46a763e68a815f4d26223d9c679953cd190f1f218187cb60decf535",
- "sha256:2327bf42c1744a434ed8ed0bbaa9168cac7ee5a22a9001f6fc85c33b8a4a14b7",
- "sha256:27c633f2d5db0fc27b51f1b08f410715b59fa3802987aec91aeb8f562724e95c",
- "sha256:2c0afb40cfb4d53487ee2ebe128649028c9a78d2476d14a67781e45dc287f080",
- "sha256:2df2bf1b87305bd95eb3ac666ee1f00a9c83d10927b8144e8e39644218f4cf81",
- "sha256:440a3ea2c955e89321a138eb7582aa1d22fe286c7d65e26a2c5411af0a88ae72",
- "sha256:6a471d4d2a6f14c97a882e8d3124869bc623f3df6177eefe02994ea41fd45b52",
- "sha256:6b306dae53ec7f4f67a10942cf8ac85de930ea90e9903e2df4001f69b7833f7e",
- "sha256:a0984ff49e176062fcdc8a5a2a670c9bb1704a2f69548bce8f8a7bad41c661bf",
- "sha256:ac5b23d0199c012ad91ed1bbb971b7666da651c6371529b1be8cbe2a7bf3c3a9",
- "sha256:acf56d564e443e3dea152efe972b1434058244298a94348fc518d6dd6a9fb0bb",
- "sha256:d3b29d717d39d3580efd760a9a46a7418408acebbb784717c90d708c9ed5f055",
- "sha256:f7d46240f7a1ae1dd95aab38bd74f7428d46531f69219954266d669da60c0818"
+ "sha256:093e3894d2d3c592ab0945d9eba9d139c139664dcf83a1c440b8a7aa9bb21955",
+ "sha256:190d51e8c1b25a47484e52a79638a8182451d6f6dff99f26ad9bd81e5359a0fa",
+ "sha256:1a5c7d7d577e0eabfcf15eb87d1e19314c8c4f0e722a301f98e0e3a65e238b4e",
+ "sha256:1e5a38aa85bd660c53947bd28aeaafb6a97d70423606f1ccb044a03a1203fe4a",
+ "sha256:322fd5fca0b1113677089d4ebd5222c964b1760e361f151cbb2706c4912112c5",
+ "sha256:4cb9936316d88bfab614666eb9e32995e794ed0f8f6b3b718666c22819c1d7ee",
+ "sha256:920bf418000dd17669d2904472efeab2b20546efd0548139618f8fa305d1d7ad",
+ "sha256:922cc5f0b98a5f2b1ff481f5551b95cd04580fd6f0c72d9b22e6c0145a4840e0",
+ "sha256:a5246d2e683a972e2187a8714b5c2cf8156c064629f9a9b1a873c1730d9e245a",
+ "sha256:b9ac1b0d8ecc49e05e4e182694f418d27f3aedcfca854ebd6c05bb1cffa10d6d",
+ "sha256:d3ef67e630b0de0779c42912fe2cbae3805ebaba30cda27fea2a3de650a9414f",
+ "sha256:f5b6320dbc3cf6cfb9f25308286f9f7ab464e65cfb105b64cc9c52831748ced2",
+ "sha256:fc04dd5189b90d825509caa510f20d1d504761e78b8dfb95a0ede180f71d50e5"
],
"index": "ia",
- "version": "==2.8.5"
+ "version": "==2.9.5"
},
"publicsuffix": {
"hashes": [
@@ -474,125 +706,396 @@
],
"version": "==1.1.1"
},
- "pylru": {
- "hashes": [
- "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ "pydantic": {
+ "hashes": [
+ "sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
+ "sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
+ "sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
+ "sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
+ "sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
+ "sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
+ "sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
+ "sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
+ "sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
+ "sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
+ "sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
+ "sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
+ "sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
+ "sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
+ "sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
+ "sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
+ "sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
+ "sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
+ "sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
+ "sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
+ "sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
+ "sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
+ "sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
+ "sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
+ "sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
+ "sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
+ "sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
+ "sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
+ "sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
+ "sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
+ "sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
+ "sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
+ "sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
+ "sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
+ "sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
+ "sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
],
- "version": "==1.2.0"
+ "index": "ia",
+ "version": "==1.10.2"
},
- "pymysql": {
+ "pylru": {
"hashes": [
- "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
- "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+ "sha256:47ad140a63ab9389648dadfbb4330700e0ffeeb28ec04664ee47d37ed133b0f4",
+ "sha256:b7c75b0676e2fbae647823bc209e23998772867d3679f1583c7350a9b02a59f0"
+ ],
+ "version": "==1.2.1"
+ },
+ "pymupdf": {
+ "hashes": [
+ "sha256:05c54acf69ee55ef97453f9c52982ef2839c188fe464d6b4cdc053bd4c6298f1",
+ "sha256:11b913664c059146e512e8559ebd9f976570ef21c0338c953836bc02051c1d7e",
+ "sha256:13ed689e5ad4c3adecb7586050de8baaa1819f48e2c57ca4e87f80e3b2727cb3",
+ "sha256:164dc67f1f5db3b22207b2aeba0fadff0503123c8f31c46768b7da7d3595a181",
+ "sha256:1e7b85e2611a9cca7a410e4c5a510a11131de7c5da9379e46615a8d3adfa6df5",
+ "sha256:38188f88a6e648b9f3a87d29de5b4ed52f910827a15859b183f1321c68e6ac00",
+ "sha256:39192c009afd8dd877a79ed02519ec8d17699bec9e9543115e490f06a553e200",
+ "sha256:4c5e7211b85e13050ac6e25879d4f0476b7a04f23bd3b6442489cec9f8da8418",
+ "sha256:7281324a0325dd30c033644cc8654167dcbfe47c4b1d49805d407fa5a64ce76b",
+ "sha256:909fb46900e7422515291761a1294902cf163226ec8918ea4c3454537336dfeb",
+ "sha256:945529b7868f9fe290b11dfbc37e2b9012610fac9763686ccf91a4d968305c5e",
+ "sha256:976fb0e93f025617890f8f8d8517371684131aa0e9fc0c1d0b4cd8bd564cce27",
+ "sha256:9998f7dfa0f99d6c2c3eb0dcfbfd44433247c23c4b781bc45f76dab421bc554b",
+ "sha256:a3b8e5c2de6192c89f379283aa07aa7fd044098dab43a8cd3ac172e961caf286",
+ "sha256:b0db8c81b6c781e373ed005f7595e49b760f91edb3b36d1dc69ec29b4fad34f8",
+ "sha256:c03004415a6d140b2c4bb494bb507c9ccbd55d713407e3b5bc1dd35fa45f2be0",
+ "sha256:cfd6c666b02a066e9e76d9ce8ca5e7fa4f2bf7a8ce6934cd2837b08509d46f8e",
+ "sha256:dffe67c5574d0ebb1e39b5ecf806fb4fd4ddb01bee5630f516ece4468252c9f0",
+ "sha256:ef3d13e27f1585d776f6a2597f113aabd28d36b648b983a72850b21c5399ab08",
+ "sha256:f04086036d40af50e5d6f54e949fa12eacda2d752562a2f85215763b137bf864",
+ "sha256:f3f96bd465e9e0e2960bb70e92233af0865181b9dd8ac5bc6b159d79584df2fe"
],
- "version": "==0.9.3"
+ "index": "ia",
+ "version": "==1.19.6"
},
"python-dateutil": {
"hashes": [
- "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
- "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
+ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+ "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
- "version": "==2.8.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==2.8.2"
},
"python-magic": {
"hashes": [
- "sha256:356efa93c8899047d1eb7d3eb91e871ba2f5b1376edbaf4cc305e3c872207355",
- "sha256:b757db2a5289ea3f1ced9e60f072965243ea43a2221430048fd8cacab17be0ce"
+ "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b",
+ "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"
],
"index": "ia",
- "version": "==0.4.18"
+ "version": "==0.4.27"
},
"python-poppler": {
"hashes": [
- "sha256:ea1f4ce962bf0278f78414c9516ba1ab626b6ade3c2356cab61d853a5d2441b7"
+ "sha256:8b6a157e51cbb4c08353a21ca3f6f396558759cdfb0b80071379ad89d5f7c533"
],
"index": "ia",
- "version": "==0.2.1"
+ "version": "==0.3.0"
},
"python-snappy": {
"hashes": [
- "sha256:9c0ba725755b749ef9b03f6ed7582cefb957c0d9f6f064a7c4314148a9dbdb61",
- "sha256:a745b3732750e2e627adf45fe2669b18afb4170431b0d100da041f807bdea0c8",
- "sha256:ac48ec6146d71627bba0fe4857984ac1f3f70a35c12eed0f91b46f353952d5fa",
- "sha256:b08db966a9c041220b1b602a2e36498dc0755b46b0d8b119f568de71804b9aed",
- "sha256:d9c26532cfa510f45e8d135cde140e8a5603d3fb254cfec273ebc0ecf9f668e2",
- "sha256:f21e8472a7f11b65b4bb5aea1c12624e2d4199aa586c57a11faa0de86a3053a6",
- "sha256:f8bbf1e04d0ec722a7f2e16f2c179f5ada4cfc0ac1196703225894303b061dbb"
+ "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af",
+ "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba",
+ "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427",
+ "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f",
+ "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026",
+ "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc",
+ "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54",
+ "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539",
+ "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c",
+ "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1",
+ "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd",
+ "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf",
+ "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1",
+ "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130",
+ "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c",
+ "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d",
+ "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6",
+ "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2",
+ "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f",
+ "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228",
+ "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96",
+ "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b",
+ "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd",
+ "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8",
+ "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25",
+ "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380",
+ "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb",
+ "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4",
+ "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614",
+ "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee",
+ "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541",
+ "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480",
+ "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72",
+ "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d",
+ "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b",
+ "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295",
+ "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc",
+ "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5",
+ "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a",
+ "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b",
+ "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf",
+ "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b",
+ "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5",
+ "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1",
+ "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7",
+ "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a",
+ "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2",
+ "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"
],
"index": "ia",
- "version": "==0.5.4"
+ "version": "==0.6.1"
},
"pytz": {
"hashes": [
- "sha256:a494d53b6d39c3c6e44c3bec237336e14305e4f29bbf800b599253057fbb79ed",
- "sha256:c35965d010ce31b23eeb663ed3cc8c906275d6be1a34393a1d73a41febf4a048"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
+ ],
+ "version": "==2022.7"
+ },
+ "pytz-deprecation-shim": {
+ "hashes": [
+ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+ "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
- "version": "==2020.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==0.1.0.post0"
},
"pyyaml": {
"hashes": [
"sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
"sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
"sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
+ "sha256:6034f55dab5fea9e53f436aa68fa3ace2634918e8b5994d82f3621c04ff5ed2e",
"sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
"sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
"sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
"sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
"sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
+ "sha256:ad9c67312c84def58f3c04504727ca879cb0013b2517c85a9a253f0cb6380c0a",
"sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
"sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
"sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
],
"version": "==5.3.1"
},
- "raven": {
- "hashes": [
- "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
- "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
- ],
- "index": "ia",
- "version": "==6.10.0"
+ "rapidfuzz": {
+ "hashes": [
+ "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e",
+ "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e",
+ "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f",
+ "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f",
+ "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3",
+ "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077",
+ "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3",
+ "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da",
+ "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410",
+ "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a",
+ "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4",
+ "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a",
+ "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351",
+ "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59",
+ "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34",
+ "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95",
+ "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c",
+ "sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245",
+ "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb",
+ "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7",
+ "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2",
+ "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e",
+ "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064",
+ "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293",
+ "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c",
+ "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a",
+ "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278",
+ "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96",
+ "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510",
+ "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60",
+ "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e",
+ "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da",
+ "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554",
+ "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5",
+ "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f",
+ "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd",
+ "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66",
+ "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc",
+ "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa",
+ "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f",
+ "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280",
+ "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c",
+ "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31",
+ "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846",
+ "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd",
+ "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5",
+ "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75",
+ "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a",
+ "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559",
+ "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337",
+ "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208",
+ "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e",
+ "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6",
+ "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625",
+ "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad",
+ "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a",
+ "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef",
+ "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de",
+ "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5",
+ "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7",
+ "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782",
+ "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f",
+ "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed",
+ "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48",
+ "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d",
+ "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77",
+ "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5",
+ "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756",
+ "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a",
+ "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f",
+ "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c",
+ "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58",
+ "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79",
+ "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2",
+ "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8",
+ "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588",
+ "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77",
+ "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069",
+ "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672",
+ "sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7",
+ "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88",
+ "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7",
+ "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2",
+ "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e",
+ "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e",
+ "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347",
+ "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4",
+ "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a",
+ "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.13.7"
},
"redis": {
"hashes": [
- "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
- "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"
+ "sha256:7b8c87d19c45d3f1271b124858d2a5c13160c4e74d4835e28273400fa34d5228",
+ "sha256:cae3ee5d1f57d8caf534cd8764edf3163c77e073bdd74b6f54a87ffafdc5e7d9"
],
- "version": "==3.5.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"regex": {
"hashes": [
- "sha256:08997a37b221a3e27d68ffb601e45abfb0093d39ee770e4257bd2f5115e8cb0a",
- "sha256:112e34adf95e45158c597feea65d06a8124898bdeac975c9087fe71b572bd938",
- "sha256:1700419d8a18c26ff396b3b06ace315b5f2a6e780dad387e4c48717a12a22c29",
- "sha256:2f6f211633ee8d3f7706953e9d3edc7ce63a1d6aad0be5dcee1ece127eea13ae",
- "sha256:52e1b4bef02f4040b2fd547357a170fc1146e60ab310cdbdd098db86e929b387",
- "sha256:55b4c25cbb3b29f8d5e63aeed27b49fa0f8476b0d4e1b3171d85db891938cc3a",
- "sha256:5aaa5928b039ae440d775acea11d01e42ff26e1561c0ffcd3d805750973c6baf",
- "sha256:654cb773b2792e50151f0e22be0f2b6e1c3a04c5328ff1d9d59c0398d37ef610",
- "sha256:690f858d9a94d903cf5cada62ce069b5d93b313d7d05456dbcd99420856562d9",
- "sha256:6ad8663c17db4c5ef438141f99e291c4d4edfeaacc0ce28b5bba2b0bf273d9b5",
- "sha256:89cda1a5d3e33ec9e231ece7307afc101b5217523d55ef4dc7fb2abd6de71ba3",
- "sha256:92d8a043a4241a710c1cf7593f5577fbb832cf6c3a00ff3fc1ff2052aff5dd89",
- "sha256:95fa7726d073c87141f7bbfb04c284901f8328e2d430eeb71b8ffdd5742a5ded",
- "sha256:97712e0d0af05febd8ab63d2ef0ab2d0cd9deddf4476f7aa153f76feef4b2754",
- "sha256:b2ba0f78b3ef375114856cbdaa30559914d081c416b431f2437f83ce4f8b7f2f",
- "sha256:bae83f2a56ab30d5353b47f9b2a33e4aac4de9401fb582b55c42b132a8ac3868",
- "sha256:c78e66a922de1c95a208e4ec02e2e5cf0bb83a36ceececc10a72841e53fbf2bd",
- "sha256:cf59bbf282b627130f5ba68b7fa3abdb96372b24b66bdf72a4920e8153fc7910",
- "sha256:e3cdc9423808f7e1bb9c2e0bdb1c9dc37b0607b30d646ff6faf0d4e41ee8fee3",
- "sha256:e9b64e609d37438f7d6e68c2546d2cb8062f3adb27e6336bc129b51be20773ac",
- "sha256:fbff901c54c22425a5b809b914a3bfaf4b9570eee0e5ce8186ac71eb2025191c"
- ],
- "version": "==2020.6.8"
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
- "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b",
- "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.24.0"
+ "version": "==2.28.1"
},
"requests-file": {
"hashes": [
@@ -609,138 +1112,189 @@
},
"s3transfer": {
"hashes": [
- "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13",
- "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db"
+ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+ "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
- "version": "==0.3.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.6.0"
},
"schedule": {
"hashes": [
- "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
- "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ "sha256:617adce8b4bf38c360b781297d59918fbebfb2878f1671d189f4f4af5d0567a4",
+ "sha256:e6ca13585e62c810e13a08682e0a6a8ad245372e376ba2b8679294f377dfc8e4"
],
- "version": "==0.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.1.0"
},
"schema": {
"hashes": [
- "sha256:3a03c2e2b22e6a331ae73750ab1da46916da6ca861b16e6f073ac1d1eba43b71",
- "sha256:b536f2375b49fdf56f36279addae98bd86a8afbd58b3c32ce363c464bed5fc1c"
+ "sha256:f06717112c61895cabc4707752b88716e8420a8819d71404501e114f91043197",
+ "sha256:f3ffdeeada09ec34bf40d7d79996d9f7175db93b7a5065de0faa7f41083c1e6c"
+ ],
+ "version": "==0.7.5"
+ },
+ "selectolax": {
+ "hashes": [
+ "sha256:010b008aca04be6cf9727d6f206a583d79a82d397126a101f57f117113a082bb",
+ "sha256:0878aa1ab3906831b20ad9e316a77c8401030dd388f3c1c72ba51bc08d497584",
+ "sha256:087e663c0ba6d9d79294508b0a3145079e838950a0e2fc7b8b1485da3fe24254",
+ "sha256:0a8dddd34dea642429629aae21cf940668eaa1c66ab0bcf9970d72f38676697d",
+ "sha256:14c9368f9dd224f895ef1431b1961d6e9a56fb26a95b5c04900def7b8961744c",
+ "sha256:17ac0b2b4222ba2c16852c0035dcd31d9e100544e6a5138f6e01f6b1648691b5",
+ "sha256:1ba1cd707a0d0090cffb2851ec6ccfdc334ed0c2ea08ae8705a9f6c97a997f77",
+ "sha256:1d38157e2358dacf55e782d332b41391821b2ef237e34e47ff276b2184c96542",
+ "sha256:1f1ec20cc75e1866f7758e543907da222c5d8072e580cf6814f2f142036c695f",
+ "sha256:1fa1737b7031b467d8613919503c85482a59c65ac91fe60074180e625e2533c6",
+ "sha256:221051ffe8c2950e9ebe41e08103397a7b287dca05a9e8084bb9e925f2d9c556",
+ "sha256:264918c1e9e6f6657f47116e4dbd74b57c660d3e86f9cc78209f132c56c8e9e5",
+ "sha256:2d8c7ce06bdf83d3cd2a617211eec48c875826bae54c74e56aec2635daac2f31",
+ "sha256:31fb0fbc88674b3346e379664c5837070e79b2f65eab3e29b7c43e1b4fc1137c",
+ "sha256:3600747c5072725580f8dc249a40ae123840f22edab950f43b349d356f44268b",
+ "sha256:3d65d0c57cfa1b05beb5c72d3cb566f4fdaf16e5112082f300cfa6bd94836aff",
+ "sha256:3daaf7ec54565d3f15f9ce046f6a8e469d966dc4fc879af8c7f753d37994f70e",
+ "sha256:418738a2f46beea2444a1587adb4f509bdd8e7ddffac071dba097c1a3ddb8cfc",
+ "sha256:46776ca482a76b3f522e4d8f90474716e4da51dc2823f3ecc6a2ff38ef0663b7",
+ "sha256:46bacca9e9f077ff2c5a973c05b8862425f077c58f2dca8059b992ceaca6b6de",
+ "sha256:4c5c68f0139d0928298ef5e95137996e0efb6f8db364b1470221e8710834a0ab",
+ "sha256:51c33d33e4e4eec0d9c1b6accdda5c93f4e3a00b28e99fc4ebb2b95d1d4ef885",
+ "sha256:585a75f4aff85b48d0fc8f3e9afbd1e2c05902a332982d04bab93e8e1db2e4a4",
+ "sha256:5acbe02c26b43428c2f49e8f09a81bd47be7ea969c6798cde1a23c2b33d25c79",
+ "sha256:6111ac9e5ca02b13d8e3057c1e20d6608435c64a11f92460a59951a7209c2cf3",
+ "sha256:67c32c29bc9011ed1b6fd67a961073e69d67bf60bf09f3db54d6240c034719f4",
+ "sha256:68c42af2cabecf04528dff2d0bbebbecfbafc394a5192b6a5b3e1dcd19eeb766",
+ "sha256:709b1680a16f210c43e4f3240dfc15e3312ccd43c9ea20c8e20c81470214cfc6",
+ "sha256:762e91a0ac0caa2d8731568e5b2ad0cec6fc06465a9dd89280118ced4b7e0849",
+ "sha256:7d47e489a8b0181992a3384987c854bd88211685e1c32dcdcb8746ec98dbcf7e",
+ "sha256:7ebe824763782f0e6ad2accd57d0cef3a61922b72be99ccafebe0154e9b8aef6",
+ "sha256:7f1a35be9413bcd56f225b1509740ea8999a6f7558e0f0a50a4ca80b91bf11be",
+ "sha256:81c7847ff0f3561559bd98015aa3fe0a2dfb26966156f7704f7f65339d48e81c",
+ "sha256:9246bf586afaacfdc0e6fb17806ee0d3e1736d3d13a87c8e96214596d50576b7",
+ "sha256:9baff22ae7015e8f2697d5db0804ee379d53fa6e54f1dc7e9f61ee8ccb1bdb2e",
+ "sha256:a4634d7c7e9d2eb65d0fc7fe0d88641eb413cb7250fbfc66b3b4d88d49e4c724",
+ "sha256:a7fa03253260c3351f61cef36865b27ad4585516e9ac4a77244d237bfaf37f13",
+ "sha256:abac4b7afe430dd135f148d4001b593b09c8f64fccd63b15fbb03b77735e3405",
+ "sha256:ad0cfc7f66a2863d199af819c79bfa160bcc830e0f83fd5391cdd80e545af758",
+ "sha256:adabfb5635d00da49bddef3844dc65ca3da81acd889ea7be2a74ef9456558f36",
+ "sha256:ae58e7cc282a768a68abbfa39eff895788a39658c5a235524c21b09d182b3d3a",
+ "sha256:b348074bc3a0e16e9af1a2f57e0da18f5def97e415c6435dadc68aead7ccf060",
+ "sha256:b48e4c8df2c226552ac18636c2ebe9d100ff3daa8742616687bd2cbf74a81e2f",
+ "sha256:c23d9f82aea887347151538a58b15a8dbee4261e4114705c0974dee81eb796e0",
+ "sha256:c2b589be0dd45d62ec43a6446f09919b5be809c708d8ff6a7cb86acd9150091b",
+ "sha256:d13904fc037bcebc6d79e83c0a19e64cc9d4771cd7f27b325c63d1071ec0d0f0",
+ "sha256:d3506e831b972c1eb22538b25e7c991289b72b2e028bd27b633dfbd21c1a511a",
+ "sha256:d809fbf258c28190160b3fe5d34adddb1da44ed7a2f800b7125e0fac6e940016",
+ "sha256:da688ca957d68b8072dc9658506c07326f6332ff3fe03214fec375a4ccc67f8a",
+ "sha256:e001a40b25e478f8390c3898c5852cf9a226668ba02fdc4d8e3a4788ce64207a",
+ "sha256:e805b106edac716047afc6e9e49953242207909bfbb70bf47c53f231e2d27d74",
+ "sha256:eb86cacac6ed203c386afe6704732fb05d831006c65869f15f41d15e9e72973b",
+ "sha256:f5cef3310fc41f71e8fc19d05534d100f6c02789d46041777b0bbd70961a94ec",
+ "sha256:f76b0ad63b55e45d3c02e50ca8b8ef64a500aed9a5f50818173b66949470f8e4",
+ "sha256:fad7fb68e929082e6474e1392dd433d465b06b59e26158ef67813c0c8e5b7f66",
+ "sha256:fb3b3425ee21f5098531ce80dc48d99a555b8b2300deb0ddf84b6bc503f0a848",
+ "sha256:fc53731aa81617694667d4c56d21a9e26df840a219f4b62588af80c6781ba613"
+ ],
+ "index": "ia",
+ "version": "==0.3.11"
+ },
+ "sentry-sdk": {
+ "extras": [],
+ "hashes": [
+ "sha256:5bbe4b72de22f9ac1e67f2a4e6efe8fbd595bb59b7b223443f50fe5802a5551c",
+ "sha256:9f0b960694e2d8bb04db4ba6ac2a645040caef4e762c65937998ff06064f10d6"
],
- "version": "==0.7.2"
+ "index": "ia",
+ "version": "==1.12.1"
},
"six": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
- "version": "==1.15.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
},
"soupsieve": {
"hashes": [
- "sha256:1634eea42ab371d3d346309b93df7870a88610f0725d47528be902a0d95ecc55",
- "sha256:a59dc181727e95d25f781f0eb4fd1825ff45590ec8ff49eadfd7f1a537cc0232"
+ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
+ "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
- "version": "==2.0.1"
- },
- "sqlalchemy": {
- "hashes": [
- "sha256:128bc917ed20d78143a45024455ff0aed7d3b96772eba13d5dbaf9cc57e5c41b",
- "sha256:156a27548ba4e1fed944ff9fcdc150633e61d350d673ae7baaf6c25c04ac1f71",
- "sha256:27e2efc8f77661c9af2681755974205e7462f1ae126f498f4fe12a8b24761d15",
- "sha256:2a12f8be25b9ea3d1d5b165202181f2b7da4b3395289000284e5bb86154ce87c",
- "sha256:31c043d5211aa0e0773821fcc318eb5cbe2ec916dfbc4c6eea0c5188971988eb",
- "sha256:65eb3b03229f684af0cf0ad3bcc771970c1260a82a791a8d07bffb63d8c95bcc",
- "sha256:6cd157ce74a911325e164441ff2d9b4e244659a25b3146310518d83202f15f7a",
- "sha256:703c002277f0fbc3c04d0ae4989a174753a7554b2963c584ce2ec0cddcf2bc53",
- "sha256:869bbb637de58ab0a912b7f20e9192132f9fbc47fc6b5111cd1e0f6cdf5cf9b0",
- "sha256:8a0e0cd21da047ea10267c37caf12add400a92f0620c8bc09e4a6531a765d6d7",
- "sha256:8d01e949a5d22e5c4800d59b50617c56125fc187fbeb8fa423e99858546de616",
- "sha256:925b4fe5e7c03ed76912b75a9a41dfd682d59c0be43bce88d3b27f7f5ba028fb",
- "sha256:9cb1819008f0225a7c066cac8bb0cf90847b2c4a6eb9ebb7431dbd00c56c06c5",
- "sha256:a87d496884f40c94c85a647c385f4fd5887941d2609f71043e2b73f2436d9c65",
- "sha256:a9030cd30caf848a13a192c5e45367e3c6f363726569a56e75dc1151ee26d859",
- "sha256:a9e75e49a0f1583eee0ce93270232b8e7bb4b1edc89cc70b07600d525aef4f43",
- "sha256:b50f45d0e82b4562f59f0e0ca511f65e412f2a97d790eea5f60e34e5f1aabc9a",
- "sha256:b7878e59ec31f12d54b3797689402ee3b5cfcb5598f2ebf26491732758751908",
- "sha256:ce1ddaadee913543ff0154021d31b134551f63428065168e756d90bdc4c686f5",
- "sha256:ce2646e4c0807f3461be0653502bb48c6e91a5171d6e450367082c79e12868bf",
- "sha256:ce6c3d18b2a8ce364013d47b9cad71db815df31d55918403f8db7d890c9d07ae",
- "sha256:e4e2664232005bd306f878b0f167a31f944a07c4de0152c444f8c61bbe3cfb38",
- "sha256:e8aa395482728de8bdcca9cc0faf3765ab483e81e01923aaa736b42f0294f570",
- "sha256:eb4fcf7105bf071c71068c6eee47499ab8d4b8f5a11fc35147c934f0faa60f23",
- "sha256:ed375a79f06cad285166e5be74745df1ed6845c5624aafadec4b7a29c25866ef",
- "sha256:f35248f7e0d63b234a109dd72fbfb4b5cb6cb6840b221d0df0ecbf54ab087654",
- "sha256:f502ef245c492b391e0e23e94cba030ab91722dcc56963c85bfd7f3441ea2bbe",
- "sha256:fe01bac7226499aedf472c62fa3b85b2c619365f3f14dd222ffe4f3aa91e5f98"
- ],
- "version": "==1.3.17"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.2.post1"
},
"surt": {
"hashes": [
- "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
- "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720"
],
"version": "==0.3.1"
},
- "tldextract": {
+ "tld": {
"hashes": [
- "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7",
- "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808"
+ "sha256:266106ad9035f54cd5cce5f823911a51f697e7c58cb45bfbd6c53b4c2976ece2",
+ "sha256:69fed19d26bb3f715366fb4af66fdeace896c55c052b00e8aaba3a7b63f3e7f0",
+ "sha256:826bbe61dccc8d63144b51caef83e1373fbaac6f9ada46fca7846021f5d36fef",
+ "sha256:843844e4256c943983d86366b5af3ac9cd1c9a0b6465f04d9f70e3b4c1a7989f",
+ "sha256:a92ac6b84917e7d9e934434b8d37e9be534598f138fbb86b3c0d5426f2621890",
+ "sha256:b6650f2d5392a49760064bc55d73ce3397a378ef24ded96efb516c6b8ec68c26",
+ "sha256:ef5b162d6fa295822dacd4fe4df1b62d8df2550795a97399a8905821b58d3702"
],
- "version": "==2.2.2"
+ "markers": "python_version >= '2.7' and python_version < '4'",
+ "version": "==0.12.6"
},
- "toml": {
+ "tldextract": {
"hashes": [
- "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f",
- "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"
+ "sha256:47aa4d8f1a4da79a44529c9a2ddc518663b25d371b805194ec5ce2a5f615ccd2",
+ "sha256:78aef13ac1459d519b457a03f1f74c1bf1c2808122a6bcc0e6840f81ba55ad73"
],
- "version": "==0.10.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.4.0"
},
- "total-ordering": {
+ "tqdm": {
"hashes": [
- "sha256:a14a2a138a52befaa02b3fd53eb3366f66da69020be299af3cf0b54c9441aacc"
+ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
+ "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "version": "==0.1.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==4.64.1"
},
- "tqdm": {
+ "trafilatura": {
"hashes": [
- "sha256:07c06493f1403c1380b630ae3dcbe5ae62abcf369a93bbc052502279f189ab8c",
- "sha256:cd140979c2bebd2311dfb14781d8f19bd5a9debb92dcab9f6ef899c987fcf71f"
+ "sha256:a66189e4b9d591dce648f0cc79fb52a486e679708090189bc4fcd88068f095ef",
+ "sha256:c2bc0cbac6248363d938666cbedbb067ad8aefe31667c88038135b93efd475c3"
],
- "version": "==4.46.1"
+ "index": "ia",
+ "version": "==1.3.0"
},
"twitter": {
"hashes": [
- "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
- "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
- ],
- "version": "==1.18.0"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:1d9a3e45f2c440f308a7116d3672b0d1981aba8ac41cb7f3ed270ed50693f0e0",
+ "sha256:80ddd69ae2eeb88313feedeea31bf119fd6e79541ee5b37abb9c43d233194e10"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==1.19.6"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "tzdata": {
+ "hashes": [
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.7"
+ },
+ "tzlocal": {
+ "hashes": [
+ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+ "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.2"
},
"urlcanon": {
"hashes": [
@@ -751,11 +1305,11 @@
},
"urllib3": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.22"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"warctools": {
"hashes": [
@@ -764,29 +1318,26 @@
"version": "==4.10.0"
},
"wayback": {
- "hashes": [
- "sha256:a761515f81d4bcfa543ddb7fbe2b584508212735c438a269e86d4196015b4d6f"
+ "extras": [
+ "brotli"
],
- "index": "ia",
- "version": "==0.6.1"
- },
- "wayback-esp": {
"hashes": [
- "sha256:283c1d38712dbf019ade15f5ffe8cf740951201a6a7cb1b9c98c3e84adb8b2f0"
+ "sha256:3a3f149508d68ec53f5cdf434a45e5bb906beef731327d7bd2ef6b751c98281b"
],
- "version": "==0.2.8"
+ "index": "ia",
+ "version": "==0.8.6.1"
},
"wayback-search-js": {
"hashes": [
- "sha256:ae83f2719b0737d173c0a91ef13e9cfcd4d2f64bca8c00719f1977bbe5f864e2"
+ "sha256:a474ba8da58f9cc27b1dce7f87a8cc7d119715ab4bab750dcc1d90f002074161"
],
- "version": "==2.12.3"
+ "version": "==3.1.21"
},
"wbex-client": {
"hashes": [
- "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
+ "sha256:8c4028d744dda05cca932b411a826f9478a65cbc018784bff9528e973c7f9c36"
],
- "version": "==0.1.5"
+ "version": "==0.1.6.1"
},
"wcwidth": {
"hashes": [
@@ -797,26 +1348,93 @@
},
"werkzeug": {
"hashes": [
- "sha256:2de2a5db0baeae7b2d2664949077c2ac63fbd16d98da0ff71837f7d1dea3fd43",
- "sha256:6c80b1e5ad3665290ea39320b91e1be1e0d5f60652b964a3070216de83d2e47c"
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
+ },
+ "zstandard": {
+ "hashes": [
+ "sha256:04c298d381a3b6274b0a8001f0da0ec7819d052ad9c3b0863fe8c7f154061f76",
+ "sha256:0fde1c56ec118940974e726c2a27e5b54e71e16c6f81d0b4722112b91d2d9009",
+ "sha256:126aa8433773efad0871f624339c7984a9c43913952f77d5abeee7f95a0c0860",
+ "sha256:1a4fb8b4ac6772e4d656103ccaf2e43e45bd16b5da324b963d58ef360d09eb73",
+ "sha256:2e4812720582d0803e84aefa2ac48ce1e1e6e200ca3ce1ae2be6d410c1d637ae",
+ "sha256:2f01b27d0b453f07cbcff01405cdd007e71f5d6410eb01303a16ba19213e58e4",
+ "sha256:31d12fcd942dd8dbf52ca5f6b1bbe287f44e5d551a081a983ff3ea2082867863",
+ "sha256:3c927b6aa682c6d96225e1c797f4a5d0b9f777b327dea912b23471aaf5385376",
+ "sha256:3d5bb598963ac1f1f5b72dd006adb46ca6203e4fb7269a5b6e1f99e85b07ad38",
+ "sha256:401508efe02341ae681752a87e8ac9ef76df85ef1a238a7a21786a489d2c983d",
+ "sha256:4514b19abe6dbd36d6c5d75c54faca24b1ceb3999193c5b1f4b685abeabde3d0",
+ "sha256:47dfa52bed3097c705451bafd56dac26535545a987b6759fa39da1602349d7ba",
+ "sha256:4fa496d2d674c6e9cffc561639d17009d29adee84a27cf1e12d3c9be14aa8feb",
+ "sha256:55a513ec67e85abd8b8b83af8813368036f03e2d29a50fc94033504918273980",
+ "sha256:55b3187e0bed004533149882ef8c24e954321f3be81f8a9ceffe35099b82a0d0",
+ "sha256:593f96718ad906e24d6534187fdade28b611f8ed06e27ba972ba48aecec45fc6",
+ "sha256:5e21032efe673b887464667d09406bab6e16d96b09ad87e80859e3a20b6745b6",
+ "sha256:60a86b7b2b1c300779167cf595e019e61afcc0e20c4838692983a921db9006ac",
+ "sha256:619f9bf37cdb4c3dc9d4120d2a1003f5db9446f3618a323219f408f6a9df6725",
+ "sha256:660b91eca10ee1b44c47843894abe3e6cfd80e50c90dee3123befbf7ca486bd3",
+ "sha256:67710d220af405f5ce22712fa741d85e8b3ada7a457ea419b038469ba379837c",
+ "sha256:6caed86cd47ae93915d9031dc04be5283c275e1a2af2ceff33932071f3eeff4d",
+ "sha256:6d2182e648e79213b3881998b30225b3f4b1f3e681f1c1eaf4cacf19bde1040d",
+ "sha256:72758c9f785831d9d744af282d54c3e0f9db34f7eae521c33798695464993da2",
+ "sha256:74c2637d12eaacb503b0b06efdf55199a11b1d7c580bd3dd9dfe84cac97ef2f6",
+ "sha256:755020d5aeb1b10bffd93d119e7709a2a7475b6ad79c8d5226cea3f76d152ce0",
+ "sha256:7ccc4727300f223184520a6064c161a90b5d0283accd72d1455bcd85ec44dd0d",
+ "sha256:81ab21d03e3b0351847a86a0b298b297fde1e152752614138021d6d16a476ea6",
+ "sha256:8371217dff635cfc0220db2720fc3ce728cd47e72bb7572cca035332823dbdfc",
+ "sha256:876567136b0359f6581ecd892bdb4ca03a0eead0265db73206c78cff03bcdb0f",
+ "sha256:879411d04068bd489db57dcf6b82ffad3c5fb2a1fdd30817c566d8b7bedee442",
+ "sha256:898500957ae5e7f31b7271ace4e6f3625b38c0ac84e8cedde8de3a77a7fdae5e",
+ "sha256:8c9ca56345b0c5574db47560603de9d05f63cce5dfeb3a456eb60f3fec737ff2",
+ "sha256:8ec2c146e10b59c376b6bc0369929647fcd95404a503a7aa0990f21c16462248",
+ "sha256:8f7c68de4f362c1b2f426395fe4e05028c56d0782b2ec3ae18a5416eaf775576",
+ "sha256:909bdd4e19ea437eb9b45d6695d722f6f0fd9d8f493e837d70f92062b9f39faf",
+ "sha256:9d97c713433087ba5cee61a3e8edb54029753d45a4288ad61a176fa4718033ce",
+ "sha256:a65e0119ad39e855427520f7829618f78eb2824aa05e63ff19b466080cd99210",
+ "sha256:aa9087571729c968cd853d54b3f6e9d0ec61e45cd2c31e0eb8a0d4bdbbe6da2f",
+ "sha256:aef0889417eda2db000d791f9739f5cecb9ccdd45c98f82c6be531bdc67ff0f2",
+ "sha256:b253d0c53c8ee12c3e53d181fb9ef6ce2cd9c41cbca1c56a535e4fc8ec41e241",
+ "sha256:b80f6f6478f9d4ca26daee6c61584499493bf97950cfaa1a02b16bb5c2c17e70",
+ "sha256:be6329b5ba18ec5d32dc26181e0148e423347ed936dda48bf49fb243895d1566",
+ "sha256:c7560f622e3849cc8f3e999791a915addd08fafe80b47fcf3ffbda5b5151047c",
+ "sha256:d1a7a716bb04b1c3c4a707e38e2dee46ac544fff931e66d7ae944f3019fc55b8",
+ "sha256:d63b04e16df8ea21dfcedbf5a60e11cbba9d835d44cb3cbff233cfd037a916d5",
+ "sha256:d777d239036815e9b3a093fa9208ad314c040c26d7246617e70e23025b60083a",
+ "sha256:e892d3177380ec080550b56a7ffeab680af25575d291766bdd875147ba246a91",
+ "sha256:e9c90a44470f2999779057aeaf33461cbd8bb59d8f15e983150d10bb260e16e0",
+ "sha256:f097dda5d4f9b9b01b3c9fa2069f9c02929365f48f341feddf3d6b32510a2f93",
+ "sha256:f4ebfe03cbae821ef994b2e58e4df6a087470cc522aca502614e82a143365d45"
],
- "version": "==1.0.1"
+ "index": "ia",
+ "version": "==0.19.0"
}
},
"develop": {
"astroid": {
"hashes": [
- "sha256:2f4078c2a41bf377eea06d71c9d2ba4eb8f6b1af2135bec27bbbb7d8f12bb703",
- "sha256:bc58d83eb610252fd8de6363e39d4f1d0619c894b0ed24603b881c02e64c7386"
+ "sha256:10e0ad5f7b79c435179d0d0f0df69998c4eef4597534aae44910db060baeb907",
+ "sha256:1493fe8bd3dfd73dc35bd53c9d5b6e49ead98497c47b2307662556a5692d29d7"
+ ],
+ "markers": "python_full_version >= '3.7.2'",
+ "version": "==2.12.13"
+ },
+ "asttokens": {
+ "hashes": [
+ "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3",
+ "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"
],
- "version": "==2.4.2"
+ "version": "==2.2.1"
},
"attrs": {
"hashes": [
- "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
- "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "version": "==19.3.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"backcall": {
"hashes": [
@@ -825,78 +1443,146 @@
],
"version": "==0.2.0"
},
+ "black": {
+ "hashes": [
+ "sha256:101c69b23df9b44247bd88e1d7e90154336ac4992502d4197bdac35dd7ee3320",
+ "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351",
+ "sha256:1f58cbe16dfe8c12b7434e50ff889fa479072096d79f0a7f25e4ab8e94cd8350",
+ "sha256:229351e5a18ca30f447bf724d007f890f97e13af070bb6ad4c0a441cd7596a2f",
+ "sha256:436cc9167dd28040ad90d3b404aec22cedf24a6e4d7de221bec2730ec0c97bcf",
+ "sha256:559c7a1ba9a006226f09e4916060982fd27334ae1998e7a38b3f33a37f7a2148",
+ "sha256:7412e75863aa5c5411886804678b7d083c7c28421210180d67dfd8cf1221e1f4",
+ "sha256:77d86c9f3db9b1bf6761244bc0b3572a546f5fe37917a044e02f3166d5aafa7d",
+ "sha256:82d9fe8fee3401e02e79767016b4907820a7dc28d70d137eb397b92ef3cc5bfc",
+ "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d",
+ "sha256:c116eed0efb9ff870ded8b62fe9f28dd61ef6e9ddd28d83d7d264a38417dcee2",
+ "sha256:d30b212bffeb1e252b31dd269dfae69dd17e06d92b87ad26e23890f3efea366f"
+ ],
+ "index": "ia",
+ "version": "==22.12.0"
+ },
"certifi": {
"hashes": [
- "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3",
- "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.6.20"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
- "chardet": {
+ "charset-normalizer": {
"hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==3.0.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
- "coverage": {
+ "click": {
"hashes": [
- "sha256:00f1d23f4336efc3b311ed0d807feb45098fc86dee1ca13b3d6768cdab187c8a",
- "sha256:01333e1bd22c59713ba8a79f088b3955946e293114479bbfc2e37d522be03355",
- "sha256:0cb4be7e784dcdc050fc58ef05b71aa8e89b7e6636b99967fadbdba694cf2b65",
- "sha256:0e61d9803d5851849c24f78227939c701ced6704f337cad0a91e0972c51c1ee7",
- "sha256:1601e480b9b99697a570cea7ef749e88123c04b92d84cedaa01e117436b4a0a9",
- "sha256:2742c7515b9eb368718cd091bad1a1b44135cc72468c731302b3d641895b83d1",
- "sha256:2d27a3f742c98e5c6b461ee6ef7287400a1956c11421eb574d843d9ec1f772f0",
- "sha256:402e1744733df483b93abbf209283898e9f0d67470707e3c7516d84f48524f55",
- "sha256:5c542d1e62eece33c306d66fe0a5c4f7f7b3c08fecc46ead86d7916684b36d6c",
- "sha256:5f2294dbf7875b991c381e3d5af2bcc3494d836affa52b809c91697449d0eda6",
- "sha256:6402bd2fdedabbdb63a316308142597534ea8e1895f4e7d8bf7476c5e8751fef",
- "sha256:66460ab1599d3cf894bb6baee8c684788819b71a5dc1e8fa2ecc152e5d752019",
- "sha256:782caea581a6e9ff75eccda79287daefd1d2631cc09d642b6ee2d6da21fc0a4e",
- "sha256:79a3cfd6346ce6c13145731d39db47b7a7b859c0272f02cdb89a3bdcbae233a0",
- "sha256:7a5bdad4edec57b5fb8dae7d3ee58622d626fd3a0be0dfceda162a7035885ecf",
- "sha256:8fa0cbc7ecad630e5b0f4f35b0f6ad419246b02bc750de7ac66db92667996d24",
- "sha256:a027ef0492ede1e03a8054e3c37b8def89a1e3c471482e9f046906ba4f2aafd2",
- "sha256:a3f3654d5734a3ece152636aad89f58afc9213c6520062db3978239db122f03c",
- "sha256:a82b92b04a23d3c8a581fc049228bafde988abacba397d57ce95fe95e0338ab4",
- "sha256:acf3763ed01af8410fc36afea23707d4ea58ba7e86a8ee915dfb9ceff9ef69d0",
- "sha256:adeb4c5b608574a3d647011af36f7586811a2c1197c861aedb548dd2453b41cd",
- "sha256:b83835506dfc185a319031cf853fa4bb1b3974b1f913f5bb1a0f3d98bdcded04",
- "sha256:bb28a7245de68bf29f6fb199545d072d1036a1917dca17a1e75bbb919e14ee8e",
- "sha256:bf9cb9a9fd8891e7efd2d44deb24b86d647394b9705b744ff6f8261e6f29a730",
- "sha256:c317eaf5ff46a34305b202e73404f55f7389ef834b8dbf4da09b9b9b37f76dd2",
- "sha256:dbe8c6ae7534b5b024296464f387d57c13caa942f6d8e6e0346f27e509f0f768",
- "sha256:de807ae933cfb7f0c7d9d981a053772452217df2bf38e7e6267c9cbf9545a796",
- "sha256:dead2ddede4c7ba6cb3a721870f5141c97dc7d85a079edb4bd8d88c3ad5b20c7",
- "sha256:dec5202bfe6f672d4511086e125db035a52b00f1648d6407cc8e526912c0353a",
- "sha256:e1ea316102ea1e1770724db01998d1603ed921c54a86a2efcb03428d5417e489",
- "sha256:f90bfc4ad18450c80b024036eaf91e4a246ae287701aaa88eaebebf150868052"
- ],
- "version": "==5.1"
+ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
+ "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.3"
+ },
+ "coverage": {
+ "extras": [
+ "toml"
+ ],
+ "hashes": [
+ "sha256:07bcfb1d8ac94af886b54e18a88b393f6a73d5959bb31e46644a02453c36e475",
+ "sha256:09f6b5a8415b6b3e136d5fec62b552972187265cb705097bf030eb9d4ffb9b60",
+ "sha256:0a79137fc99815fff6a852c233628e735ec15903cfd16da0f229d9c4d45926ab",
+ "sha256:0b4b3a4d9915b2be879aff6299c0a6129f3d08a775d5a061f503cf79571f73e4",
+ "sha256:1285648428a6101b5f41a18991c84f1c3959cee359e51b8375c5882fc364a13f",
+ "sha256:12a5aa77783d49e05439fbe6e6b427484f8a0f9f456b46a51d8aac022cfd024d",
+ "sha256:19ec666533f0f70a0993f88b8273057b96c07b9d26457b41863ccd021a043b9a",
+ "sha256:1e414dc32ee5c3f36544ea466b6f52f28a7af788653744b8570d0bf12ff34bc0",
+ "sha256:2c44fcfb3781b41409d0f060a4ed748537557de9362a8a9282182fafb7a76ab4",
+ "sha256:397b4a923cc7566bbc7ae2dfd0ba5a039b61d19c740f1373791f2ebd11caea59",
+ "sha256:3cfc595d2af13856505631be072835c59f1acf30028d1c860b435c5fc9c15b69",
+ "sha256:3dd4ee135e08037f458425b8842d24a95a0961831a33f89685ff86b77d378f89",
+ "sha256:486ee81fa694b4b796fc5617e376326a088f7b9729c74d9defa211813f3861e4",
+ "sha256:4f943a3b2bc520102dd3e0bb465e1286e12c9a54f58accd71b9e65324d9c7c01",
+ "sha256:63d56165a7c76265468d7e0c5548215a5ba515fc2cba5232d17df97bffa10f6c",
+ "sha256:66b18c3cf8bbab0cce0d7b9e4262dc830e93588986865a8c78ab2ae324b3ed56",
+ "sha256:691571f31ace1837838b7e421d3a09a8c00b4aac32efacb4fc9bd0a5c647d25a",
+ "sha256:6c5ad996c6fa4d8ed669cfa1e8551348729d008a2caf81489ab9ea67cfbc7498",
+ "sha256:6d55d840e1b8c0002fce66443e124e8581f30f9ead2e54fbf6709fb593181f2c",
+ "sha256:72d1507f152abacea81f65fee38e4ef3ac3c02ff8bc16f21d935fd3a8a4ad910",
+ "sha256:74f70cd92669394eaf8d7756d1b195c8032cf7bbbdfce3bc489d4e15b3b8cf73",
+ "sha256:830525361249dc4cd013652b0efad645a385707a5ae49350c894b67d23fbb07c",
+ "sha256:854f22fa361d1ff914c7efa347398374cc7d567bdafa48ac3aa22334650dfba2",
+ "sha256:89caf4425fe88889e2973a8e9a3f6f5f9bbe5dd411d7d521e86428c08a873a4a",
+ "sha256:9158f8fb06747ac17bd237930c4372336edc85b6e13bdc778e60f9d685c3ca37",
+ "sha256:92651580bd46519067e36493acb394ea0607b55b45bd81dd4e26379ed1871f55",
+ "sha256:978258fec36c154b5e250d356c59af7d4c3ba02bef4b99cda90b6029441d797d",
+ "sha256:9823e4789ab70f3ec88724bba1a203f2856331986cd893dedbe3e23a6cfc1e4e",
+ "sha256:9b373c9345c584bb4b5f5b8840df7f4ab48c4cbb7934b58d52c57020d911b856",
+ "sha256:a4a574a19eeb67575a5328a5760bbbb737faa685616586a9f9da4281f940109c",
+ "sha256:aec2d1515d9d39ff270059fd3afbb3b44e6ec5758af73caf18991807138c7118",
+ "sha256:b3695c4f4750bca943b3e1f74ad4be8d29e4aeab927d50772c41359107bd5d5c",
+ "sha256:b3763e7fcade2ff6c8e62340af9277f54336920489ceb6a8cd6cc96da52fcc62",
+ "sha256:b66bb21a23680dee0be66557dc6b02a3152ddb55edf9f6723fa4a93368f7158d",
+ "sha256:b6f22bb64cc39bcb883e5910f99a27b200fdc14cdd79df8696fa96b0005c9444",
+ "sha256:b77015d1cb8fe941be1222a5a8b4e3fbca88180cfa7e2d4a4e58aeabadef0ab7",
+ "sha256:b9ea158775c7c2d3e54530a92da79496fb3fb577c876eec761c23e028f1e216c",
+ "sha256:c20cfebcc149a4c212f6491a5f9ff56f41829cd4f607b5be71bb2d530ef243b1",
+ "sha256:cfded268092a84605f1cc19e5c737f9ce630a8900a3589e9289622db161967e9",
+ "sha256:d1991f1dd95eba69d2cd7708ff6c2bbd2426160ffc73c2b81f617a053ebcb1a8",
+ "sha256:d3022c3007d3267a880b5adcf18c2a9bf1fc64469b394a804886b401959b8742",
+ "sha256:d6814854c02cbcd9c873c0f3286a02e3ac1250625cca822ca6bc1018c5b19f1c",
+ "sha256:d87717959d4d0ee9db08a0f1d80d21eb585aafe30f9b0a54ecf779a69cb015f6",
+ "sha256:e00c14720b8b3b6c23b487e70bd406abafc976ddc50490f645166f111c419c39",
+ "sha256:e60bef2e2416f15fdc05772bf87db06c6a6f9870d1db08fdd019fbec98ae24a9",
+ "sha256:e78e9dcbf4f3853d3ae18a8f9272111242531535ec9e1009fa8ec4a2b74557dc",
+ "sha256:f66460f17c9319ea4f91c165d46840314f0a7c004720b20be58594d162a441d8",
+ "sha256:fa6a5a224b7f4cfb226f4fc55a57e8537fcc096f42219128c2c74c0e7d0953e1",
+ "sha256:fb992c47cb1e5bd6a01e97182400bcc2ba2077080a17fcd7be23aaa6e572e390",
+ "sha256:fd1b9c5adc066db699ccf7fa839189a649afcdd9e02cb5dc9d24e67e7922737d",
+ "sha256:fd556ff16a57a070ce4f31c635953cc44e25244f91a0378c6e9bdfd40fdb249f"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==7.0.1"
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
+ },
+ "dill": {
+ "hashes": [
+ "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0",
+ "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"
],
- "version": "==4.4.2"
+ "markers": "python_version < '3.11'",
+ "version": "==0.3.6"
+ },
+ "executing": {
+ "hashes": [
+ "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc",
+ "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"
+ ],
+ "version": "==1.2.0"
},
"flake8": {
"hashes": [
- "sha256:15e351d19611c887e482fb960eae4d44845013cc142d42896e9862f775d8cf5c",
- "sha256:f04b9fcbac03b0a3e58c0ab3a0ecc462e023a9faf046d57794184028123aa208"
+ "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7",
+ "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"
],
"index": "ia",
- "version": "==3.8.3"
+ "version": "==6.0.0"
},
"flake8-annotations": {
"hashes": [
- "sha256:9091d920406a7ff10e401e0dd1baa396d1d7d2e3d101a9beecf815f5894ad554",
- "sha256:f59fdceb8c8f380a20aed20e1ba8a57bde05935958166c52be2249f113f7ab75"
+ "sha256:11f09efb99ae63c8f9d6b492b75fe147fbc323179fddfe00b2e56eefeca42f57",
+ "sha256:a4385158a7a9fc8af1d8820a2f4c8d03387997006a83f5f8bfe5bc6085bdf88a"
],
"index": "ia",
- "version": "==2.1.0"
+ "version": "==2.9.1"
},
"idna": {
"hashes": [
@@ -905,108 +1591,113 @@
],
"version": "==2.6"
},
- "importlab": {
- "hashes": [
- "sha256:d855350d19dc10a17aabd2fe6f4b428ff1a936071f692fbf686a73694d26a51c"
- ],
- "version": "==0.5.1"
- },
- "importlib-metadata": {
+ "iniconfig": {
"hashes": [
- "sha256:0505dd08068cfec00f53a74a0ad927676d7757da81b7436a6eefe4c7cf75c545",
- "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"
+ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+ "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
- "markers": "python_version < '3.8'",
- "version": "==1.6.1"
+ "version": "==1.1.1"
},
"ipython": {
"hashes": [
- "sha256:0ef1433879816a960cd3ae1ae1dc82c64732ca75cec8dab5a4e29783fb571d0e",
- "sha256:1b85d65632211bf5d3e6f1406f3393c8c429a47d7b947b9a87812aa5bce6595c"
+ "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9",
+ "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"
],
"index": "ia",
- "version": "==7.15.0"
- },
- "ipython-genutils": {
- "hashes": [
- "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
- "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
- ],
- "version": "==0.2.0"
+ "version": "==8.7.0"
},
"isort": {
"hashes": [
- "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
- "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
+ "sha256:6db30c5ded9815d813932c04c2f85a360bcdd35fed496f4d8f35495ef0a261b6",
+ "sha256:c033fd0edb91000a7f09527fe5c75321878f98322a77ddcc81adbd83724afb7b"
],
- "version": "==4.3.21"
+ "index": "ia",
+ "version": "==5.11.4"
},
"jedi": {
"hashes": [
- "sha256:1ddb0ec78059e8e27ec9eb5098360b4ea0a3dd840bedf21415ea820c21b40a22",
- "sha256:807d5d4f96711a2bcfdd5dfa3b1ae6d09aa53832b182090b222b5efb81f52f63"
+ "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e",
+ "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"
],
- "version": "==0.17.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.18.2"
},
"lazy-object-proxy": {
"hashes": [
- "sha256:0c4b206227a8097f05c4dbdd323c50edf81f15db3b8dc064d08c62d37e1a504d",
- "sha256:194d092e6f246b906e8f70884e620e459fc54db3259e60cf69a4d66c3fda3449",
- "sha256:1be7e4c9f96948003609aa6c974ae59830a6baecc5376c25c92d7d697e684c08",
- "sha256:4677f594e474c91da97f489fea5b7daa17b5517190899cf213697e48d3902f5a",
- "sha256:48dab84ebd4831077b150572aec802f303117c8cc5c871e182447281ebf3ac50",
- "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd",
- "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239",
- "sha256:8d859b89baf8ef7f8bc6b00aa20316483d67f0b1cbf422f5b4dc56701c8f2ffb",
- "sha256:9254f4358b9b541e3441b007a0ea0764b9d056afdeafc1a5569eee1cc6c1b9ea",
- "sha256:9651375199045a358eb6741df3e02a651e0330be090b3bc79f6d0de31a80ec3e",
- "sha256:97bb5884f6f1cdce0099f86b907aa41c970c3c672ac8b9c8352789e103cf3156",
- "sha256:9b15f3f4c0f35727d3a0fba4b770b3c4ebbb1fa907dbcc046a1d2799f3edd142",
- "sha256:a2238e9d1bb71a56cd710611a1614d1194dc10a175c1e08d75e1a7bcc250d442",
- "sha256:a6ae12d08c0bf9909ce12385803a543bfe99b95fe01e752536a60af2b7797c62",
- "sha256:ca0a928a3ddbc5725be2dd1cf895ec0a254798915fb3a36af0964a0a4149e3db",
- "sha256:cb2c7c57005a6804ab66f106ceb8482da55f5314b7fcb06551db1edae4ad1531",
- "sha256:d74bb8693bf9cf75ac3b47a54d716bbb1a92648d5f781fc799347cfc95952383",
- "sha256:d945239a5639b3ff35b70a88c5f2f491913eb94871780ebfabb2568bd58afc5a",
- "sha256:eba7011090323c1dadf18b3b689845fd96a61ba0a1dfbd7f24b921398affc357",
- "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
- "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
- ],
- "version": "==1.4.3"
+ "sha256:0c1c7c0433154bb7c54185714c6929acc0ba04ee1b167314a779b9025517eada",
+ "sha256:14010b49a2f56ec4943b6cf925f597b534ee2fe1f0738c84b3bce0c1a11ff10d",
+ "sha256:4e2d9f764f1befd8bdc97673261b8bb888764dfdbd7a4d8f55e4fbcabb8c3fb7",
+ "sha256:4fd031589121ad46e293629b39604031d354043bb5cdf83da4e93c2d7f3389fe",
+ "sha256:5b51d6f3bfeb289dfd4e95de2ecd464cd51982fe6f00e2be1d0bf94864d58acd",
+ "sha256:6850e4aeca6d0df35bb06e05c8b934ff7c533734eb51d0ceb2d63696f1e6030c",
+ "sha256:6f593f26c470a379cf7f5bc6db6b5f1722353e7bf937b8d0d0b3fba911998858",
+ "sha256:71d9ae8a82203511a6f60ca5a1b9f8ad201cac0fc75038b2dc5fa519589c9288",
+ "sha256:7e1561626c49cb394268edd00501b289053a652ed762c58e1081224c8d881cec",
+ "sha256:8f6ce2118a90efa7f62dd38c7dbfffd42f468b180287b748626293bf12ed468f",
+ "sha256:ae032743794fba4d171b5b67310d69176287b5bf82a21f588282406a79498891",
+ "sha256:afcaa24e48bb23b3be31e329deb3f1858f1f1df86aea3d70cb5c8578bfe5261c",
+ "sha256:b70d6e7a332eb0217e7872a73926ad4fdc14f846e85ad6749ad111084e76df25",
+ "sha256:c219a00245af0f6fa4e95901ed28044544f50152840c5b6a3e7b2568db34d156",
+ "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8",
+ "sha256:d176f392dbbdaacccf15919c77f526edf11a34aece58b55ab58539807b85436f",
+ "sha256:e20bfa6db17a39c706d24f82df8352488d2943a3b7ce7d4c22579cb89ca8896e",
+ "sha256:eac3a9a5ef13b332c059772fd40b4b1c3d45a3a2b05e33a361dee48e54a4dad0",
+ "sha256:eb329f8d8145379bf5dbe722182410fe8863d186e51bf034d2075eb8d85ee25b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.8.0"
+ },
+ "matplotlib-inline": {
+ "hashes": [
+ "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311",
+ "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==0.1.6"
},
"mccabe": {
"hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
],
- "version": "==0.6.1"
- },
- "more-itertools": {
- "hashes": [
- "sha256:68c70cc7167bdf5c7c9d8f6954a7837089c6a36bf565383919bb595efb8a17e5",
- "sha256:b78134b2063dd214000685165d81c154522c3ee0a1c0d4d113c80361c234c5a2"
- ],
- "version": "==8.4.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
},
"mypy": {
"hashes": [
- "sha256:2c6cde8aa3426c1682d35190b59b71f661237d74b053822ea3d748e2c9578a7c",
- "sha256:3fdda71c067d3ddfb21da4b80e2686b71e9e5c72cca65fa216d207a358827f86",
- "sha256:5dd13ff1f2a97f94540fd37a49e5d255950ebcdf446fb597463a40d0df3fac8b",
- "sha256:6731603dfe0ce4352c555c6284c6db0dc935b685e9ce2e4cf220abe1e14386fd",
- "sha256:6bb93479caa6619d21d6e7160c552c1193f6952f0668cdda2f851156e85186fc",
- "sha256:81c7908b94239c4010e16642c9102bfc958ab14e36048fa77d0be3289dda76ea",
- "sha256:9c7a9a7ceb2871ba4bac1cf7217a7dd9ccd44c27c2950edbc6dc08530f32ad4e",
- "sha256:a4a2cbcfc4cbf45cd126f531dedda8485671545b43107ded25ce952aac6fb308",
- "sha256:b7fbfabdbcc78c4f6fc4712544b9b0d6bf171069c6e0e3cb82440dd10ced3406",
- "sha256:c05b9e4fb1d8a41d41dec8786c94f3b95d3c5f528298d769eb8e73d293abc48d",
- "sha256:d7df6eddb6054d21ca4d3c6249cae5578cb4602951fd2b6ee2f5510ffb098707",
- "sha256:e0b61738ab504e656d1fe4ff0c0601387a5489ca122d55390ade31f9ca0e252d",
- "sha256:eff7d4a85e9eea55afa34888dfeaccde99e7520b51f867ac28a48492c0b1130c",
- "sha256:f05644db6779387ccdb468cc47a44b4356fc2ffa9287135d05b70a98dc83b89a"
+ "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d",
+ "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6",
+ "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf",
+ "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f",
+ "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813",
+ "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33",
+ "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad",
+ "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05",
+ "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297",
+ "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06",
+ "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd",
+ "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243",
+ "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305",
+ "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476",
+ "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711",
+ "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70",
+ "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5",
+ "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461",
+ "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab",
+ "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c",
+ "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d",
+ "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135",
+ "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93",
+ "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648",
+ "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a",
+ "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb",
+ "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3",
+ "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372",
+ "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb",
+ "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"
],
"index": "ia",
- "version": "==0.782"
+ "version": "==0.991"
},
"mypy-extensions": {
"hashes": [
@@ -1015,42 +1706,29 @@
],
"version": "==0.4.3"
},
- "networkx": {
- "hashes": [
- "sha256:cdfbf698749a5014bf2ed9db4a07a5295df1d3a53bf80bf3cbd61edf9df05fa1",
- "sha256:f8f4ff0b6f96e4f9b16af6b84622597b5334bf9cae8cf9b2e42e7985d5c95c64"
- ],
- "version": "==2.4"
- },
- "ninja": {
+ "packaging": {
"hashes": [
- "sha256:18bd4ebc6cef30981e966609362090a0d99aeca29a63ca83a3688305f1c35222",
- "sha256:39f9ab35f52b540777b77cc889ffed37182c7d55bec00f658f6f74bd5b1a4377",
- "sha256:3c206a4b8b896f396aeabfc0dbd99d84bc01306a3e07568d28d5536c24cbeaa3",
- "sha256:3d4b1a3fa4d68c9dc74f50875c9bfe4eaaf495b5205d12526aea95043488c8b6",
- "sha256:5ae857e0283acbf4b3645756d9e8217fddbe1f41dfe33e2c40dc79cb69706a8c",
- "sha256:607211b652a32006cda8a72a1496c348ddadcbe30986ff264e7354972fa3194e",
- "sha256:6ba8b42193600bfbde76dc32d7f6fd5675e253a9e5d7caad4a2735a84a72d491",
- "sha256:760de263a261919fc97cf1fd30d2dd8902dd89d5165d6cbf80ce3d66a39fff11",
- "sha256:9897b92c626caabe51fce04a9be851f635ed828a55c558a9cf1a75571b4c4fce",
- "sha256:ddfac074ae408e42c617cd44f90a95bf6db94f0c846c95ef2a3a9a03438027a1",
- "sha256:fa6d68b4f65aca57594d3cccfcf8fa7c8a311e93c55eed8043cabc439617d7b7"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "version": "==1.10.0.post1"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
- "packaging": {
+ "parso": {
"hashes": [
- "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8",
- "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"
+ "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0",
+ "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"
],
- "version": "==20.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
- "parso": {
+ "pathspec": {
"hashes": [
- "sha256:158c140fc04112dc45bca311633ae5033c2c2a7b732fa33d0955bad8152a8dd0",
- "sha256:908e9fae2144a076d72ae4e25539143d40b8e3eafbaeae03c1bfe226f4cdf12c"
+ "sha256:3c95343af8b756205e2aba76e843ba9520a24dd84f68c22b9f93251507509dd6",
+ "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"
],
- "version": "==0.7.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.10.3"
},
"pexpect": {
"hashes": [
@@ -1067,210 +1745,255 @@
],
"version": "==0.7.5"
},
+ "platformdirs": {
+ "hashes": [
+ "sha256:1a89a12377800c81983db6be069ec068eee989748799b946cce2a6e80dcc54ca",
+ "sha256:b46ffafa316e6b83b47489d240ce17173f123a9b9c83282141c3daf26ad9ac2e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.6.0"
+ },
"pluggy": {
"hashes": [
- "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
- "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+ "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
- "version": "==0.13.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.0.0"
},
"prompt-toolkit": {
"hashes": [
- "sha256:563d1a4140b63ff9dd587bda9557cffb2fe73650205ab6f4383092fb882e7dc8",
- "sha256:df7e9e63aea609b1da3a65641ceaf5bc7d05e0a04de5bd45d05dbeffbabf9e04"
+ "sha256:3e163f254bef5a03b146397d7c1963bd3e2812f0964bb9a24e6ec761fd28db63",
+ "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305"
],
- "version": "==3.0.5"
+ "markers": "python_full_version >= '3.6.2'",
+ "version": "==3.0.36"
},
"ptyprocess": {
"hashes": [
- "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
- "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35",
+ "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"
],
- "version": "==0.6.0"
+ "version": "==0.7.0"
+ },
+ "pure-eval": {
+ "hashes": [
+ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
+ "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ ],
+ "version": "==0.2.2"
},
"py": {
"hashes": [
- "sha256:a673fa23d7000440cc885c17dbd34fafcb7d7a6e230b29f6766400de36a33c44",
- "sha256:f3b3a4c36512a4c4f024041ab51866f11761cc169670204b235f6b20523d4e6b"
+ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
+ "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
- "version": "==1.8.2"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.11.0"
},
"pycodestyle": {
"hashes": [
- "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367",
- "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e"
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
],
- "version": "==2.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.10.0"
},
"pyflakes": {
"hashes": [
- "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92",
- "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"
+ "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf",
+ "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"
],
- "version": "==2.2.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.1"
},
"pygments": {
"hashes": [
- "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44",
- "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"
+ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
+ "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "version": "==2.6.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.13.0"
},
"pylint": {
"hashes": [
- "sha256:7dd78437f2d8d019717dbf287772d0b2dbdfd13fc016aa7faa08d67bccc46adc",
- "sha256:d0ece7d223fe422088b0e8f13fa0a1e8eb745ebffcb8ed53d3e95394b6101a1c"
+ "sha256:18783cca3cfee5b83c6c5d10b3cdb66c6594520ffae61890858fe8d932e1c6b4",
+ "sha256:349c8cd36aede4d50a0754a8c0218b43323d13d5d88f4b2952ddfe3e169681eb"
],
"index": "ia",
- "version": "==2.5.3"
- },
- "pyparsing": {
- "hashes": [
- "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
- "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
- ],
- "version": "==2.4.7"
+ "version": "==2.15.9"
},
"pytest": {
"hashes": [
- "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1",
- "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"
+ "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+ "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
"index": "ia",
- "version": "==5.4.3"
+ "version": "==6.2.5"
},
"pytest-cov": {
"hashes": [
- "sha256:1a629dc9f48e53512fcbfda6b07de490c374b0c83c55ff7a1720b3fccff0ac87",
- "sha256:6e6d18092dce6fad667cd7020deed816f858ad3b49d5b5e2b1cc1c97a4dba65c"
+ "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b",
+ "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"
],
"index": "ia",
- "version": "==2.10.0"
+ "version": "==4.0.0"
},
"pytest-mock": {
"hashes": [
- "sha256:636e792f7dd9e2c80657e174c04bf7aa92672350090736d82e97e92ce8f68737",
- "sha256:a9fedba70e37acf016238bb2293f2652ce19985ceb245bbd3d7f3e4032667402"
+ "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b",
+ "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"
],
"index": "ia",
- "version": "==3.1.1"
+ "version": "==3.10.0"
},
"pytest-pylint": {
"hashes": [
- "sha256:b0c177d63f6e3f5b82fa2720a6570dd2ecff1616c26ed6d02d0cbf75fd98ddf9",
- "sha256:c6a1b9ad7dc819ea56ebd45fc1f5a611f0848b9a5b85fdcd8deafd07b22e7f2e"
+ "sha256:b51d3f93bed9c192e2b046f16520981bee5abe7bd61b070306e7ee685219fdd3",
+ "sha256:d88e83c1023c641548a9ec3567707ceee7616632a986af133426d4a74d066932"
],
"index": "ia",
- "version": "==0.17.0"
+ "version": "==0.19.0"
},
"pytest-pythonpath": {
"hashes": [
- "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6",
+ "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"
],
"index": "ia",
- "version": "==0.7.3"
+ "version": "==0.7.4"
},
- "pytype": {
+ "requests": {
"hashes": [
- "sha256:08ddb9940764492b701a8985c30437239eb2c34003448cca760769264f5ff2f8"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2020.6.1"
+ "version": "==2.28.1"
},
- "pyyaml": {
+ "responses": {
"hashes": [
- "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
- "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
- "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
- "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
- "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
- "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
- "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
- "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
- "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
- "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
- "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
+ "sha256:396acb2a13d25297789a5866b4881cf4e46ffd49cc26c43ab1117f40b973102e",
+ "sha256:dcf294d204d14c436fddcc74caefdbc5764795a40ff4e6a7740ed8ddbf3294be"
],
- "version": "==5.3.1"
+ "index": "ia",
+ "version": "==0.22.0"
},
- "requests": {
+ "six": {
+ "hashes": [
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
+ },
+ "stack-data": {
+ "hashes": [
+ "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815",
+ "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"
+ ],
+ "version": "==0.6.2"
+ },
+ "toml": {
+ "hashes": [
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.10.2"
+ },
+ "tomli": {
+ "hashes": [
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+ ],
+ "version": "==2.0.1"
+ },
+ "tomlkit": {
+ "hashes": [
+ "sha256:07de26b0d8cfc18f871aec595fda24d95b08fef89d147caa861939f37230bf4b",
+ "sha256:71b952e5721688937fb02cf9d354dbcf0785066149d2855e44531ebdd2b65d73"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.11.6"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:6cc57d6dc28c85d5365961726ffd19b538739347749e13ebe34e03323a0e8f84",
+ "sha256:c864831efa0ba6576d09b44884b34e41defc18c0d7e720b4a2d6698c842cab3e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==5.8.0"
+ },
+ "types-beautifulsoup4": {
"hashes": [
- "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b",
- "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898"
+ "sha256:c1f803367a2b07ad4fdac40ddbea557010dc4ddd1ee92d801f317eb02e2e3c72",
+ "sha256:d46be8f409ddccb6daaa9d118484185e70bcf552085c39c6d05b157cd1462e04"
],
"index": "ia",
- "version": "==2.24.0"
+ "version": "==4.11.6.1"
},
- "responses": {
+ "types-dateparser": {
"hashes": [
- "sha256:7bb697a5fedeb41d81e8b87f152d453d5cab42dcd1691b6a7d6097e94d33f373",
- "sha256:af94d28cdfb48ded0ad82a5216616631543650f440334a693479b8991a6594a2"
+ "sha256:5b0c8845167981f68f090894aa371bddbd0371341b90c3f868ac9524cd0a6b69",
+ "sha256:65232f1b3a952476fb98b31ae0a4019efd32635981040149b97b161d5ce2b4da"
],
"index": "ia",
- "version": "==0.10.15"
+ "version": "==1.1.4.4"
},
- "six": {
+ "types-pillow": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:98b8484ff343676f6f7051682a6cfd26896e993e86b3ce9badfa0ec8750f5405",
+ "sha256:c18d466dc18550d96b8b4a279ff94f0cbad696825b5ad55466604f1daf5709de"
],
- "version": "==1.15.0"
+ "index": "ia",
+ "version": "==9.3.0.4"
},
- "toml": {
+ "types-psycopg2": {
+ "hashes": [
+ "sha256:084558d6bc4b2cfa249b06be0fdd9a14a69d307bae5bb5809a2f14cfbaa7a23f",
+ "sha256:bff045579642ce00b4a3c8f2e401b7f96dfaa34939f10be64b0dd3b53feca57d"
+ ],
+ "index": "ia",
+ "version": "==2.9.21.2"
+ },
+ "types-requests": {
"hashes": [
- "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f",
- "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"
+ "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3",
+ "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"
],
- "version": "==0.10.1"
+ "index": "ia",
+ "version": "==2.28.11.7"
},
- "traitlets": {
+ "types-toml": {
+ "hashes": [
+ "sha256:171bdb3163d79a520560f24ba916a9fc9bff81659c5448a9fea89240923722be",
+ "sha256:b7b5c4977f96ab7b5ac06d8a6590d17c0bf252a96efc03b109c2711fb3e0eafd"
+ ],
+ "version": "==0.10.8.1"
+ },
+ "types-urllib3": {
"hashes": [
- "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44",
- "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7"
- ],
- "version": "==4.3.3"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49",
+ "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"
+ ],
+ "version": "==1.26.25.4"
},
"typing-extensions": {
"hashes": [
- "sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5",
- "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae",
- "sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392"
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
],
- "version": "==3.7.4.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"urllib3": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.22"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"wcwidth": {
"hashes": [
@@ -1281,16 +2004,73 @@
},
"wrapt": {
"hashes": [
- "sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7"
- ],
- "version": "==1.12.1"
- },
- "zipp": {
- "hashes": [
- "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b",
- "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"
- ],
- "version": "==3.1.0"
+ "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3",
+ "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b",
+ "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4",
+ "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2",
+ "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656",
+ "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3",
+ "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff",
+ "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310",
+ "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a",
+ "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57",
+ "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069",
+ "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383",
+ "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe",
+ "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87",
+ "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d",
+ "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b",
+ "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907",
+ "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f",
+ "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0",
+ "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28",
+ "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1",
+ "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853",
+ "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc",
+ "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3",
+ "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3",
+ "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164",
+ "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1",
+ "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c",
+ "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1",
+ "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7",
+ "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1",
+ "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320",
+ "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed",
+ "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1",
+ "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248",
+ "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c",
+ "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456",
+ "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77",
+ "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef",
+ "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1",
+ "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7",
+ "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86",
+ "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4",
+ "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d",
+ "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d",
+ "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8",
+ "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5",
+ "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471",
+ "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00",
+ "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68",
+ "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3",
+ "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d",
+ "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735",
+ "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d",
+ "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569",
+ "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7",
+ "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59",
+ "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5",
+ "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb",
+ "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b",
+ "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f",
+ "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462",
+ "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015",
+ "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==1.14.1"
}
}
}
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4395f19
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,46 @@
+
+This directory contains `sandcrawler` python code for ingest pipelines, batch
+processing, PDF extraction, etc.
+
+
+## Development Quickstart
+
+As of December 2022, working with this code requires:
+
+- Python 3.8 (specifically, due to version specification in `pipenv`)
+- `pipenv` for python dependency management
+- generic and python-specific build tools (`pkg-config`, `python-dev`, etc)
+- poppler (PDF processing library)
+- libmagic
+- libsodium
+- access to IA internal packages (`devpi.us.archive.org`), specifically for
+ globalwayback and related packages
+
+In production and CI we use Ubuntu Focal (20.04). The CI script for this
+repository (`../.gitlab-ci.yml`) is the best place to look for a complete list
+of dependencies for both development and deployment. Note that our CI system
+runs from our cluster, which resolves the devpi access issue. For developer
+laptops, you may need `sshuttle` or something similar set up to do initial
+package pulls.
+
+It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when
+working with pipenv. You can include this in a `.env` file.
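+
+For example, a minimal `.env` for local development (just a sketch; add other
+values as needed) might contain:
+
+    PIPENV_VENV_IN_PROJECT=true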
+
+There is a Makefile which helps with the basics. Eg:
+
+ # install deps using pipenv
+ make deps
+
+ # run python tests
+ make test
+
+ # run code formatting and lint checks
+ make fmt lint
+
+Sometimes when developing it is helpful to enter a shell with pipenv, eg:
+
+ pipenv shell
+
+Often when developing it is helpful (or necessary) to set environment
+variables. `pipenv shell` will read from `.env`, so you can copy and edit
+`example.env`, and it will be used in tests, `pipenv shell`, etc.
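+
+For example (a sketch; fill in real values as needed):
+
+    cp example.env .env
+    # edit .env; it will then be picked up by `pipenv shell`, `make test`, etc.
+    pipenv shell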
diff --git a/python/TODO b/python/TODO
deleted file mode 100644
index 58a463f..0000000
--- a/python/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-
-ingest crawler:
-- SPNv2 only
- - remove most SPNv1/v2 path selection
-- landing page + fulltext hops only (short recursion depth)
-- use wayback client library instead of requests to fetch content
-- https://pypi.org/project/ratelimit/
diff --git a/python/example.env b/python/example.env
index 4d3baa0..85af66c 100644
--- a/python/example.env
+++ b/python/example.env
@@ -1,7 +1,8 @@
-MINIO_ACCESS_KEY="minioadmin"
-MINIO_SECRET_KEY="minioadmin"
+SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin"
+SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"
IA_ACCESS_KEY="dummy"
IA_SECRET_KEY="dummy"
CDX_AUTH_TOKEN="dummy"
PETABOX_WEBDATA_SECRET="dummy"
SENTRY_DSN=""
+SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/"
diff --git a/python/grobid2json.py b/python/grobid2json.py
deleted file mode 100755
index 0eae6fe..0000000
--- a/python/grobid2json.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-encumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-"""
-
-import io
-import json
-import argparse
-import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
-
-xml_ns = "http://www.w3.org/XML/1998/namespace"
-ns = "http://www.tei-c.org/ns/1.0"
-
-
-def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
- if not elem:
- return []
- names = []
- for author in elem.findall(".//{%s}author" % ns):
- pn = author.find("./{%s}persName" % ns)
- if not pn:
- continue
- given_name = pn.findtext("./{%s}forename" % ns) or None
- surname = pn.findtext("./{%s}surname" % ns) or None
- full_name = " ".join(pn.itertext())
- obj: Dict[str, Any] = dict(name=full_name)
- if given_name:
- obj["given_name"] = given_name
- if surname:
- obj["surname"] = surname
- ae = author.find("./{%s}affiliation" % ns)
- if ae:
- affiliation: Dict[str, Any] = dict()
- for on in ae.findall("./{%s}orgName" % ns):
- on_type = on.get("type")
- if on_type:
- affiliation[on_type] = on.text
- addr_e = ae.find("./{%s}address" % ns)
- if addr_e:
- address = dict()
- for t in addr_e.getchildren():
- address[t.tag.split("}")[-1]] = t.text
- if address:
- affiliation["address"] = address
- # affiliation['address'] = {
- # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
- # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
- # 'country': addr.findtext('./{%s}country' % ns) or None,
- # }
- obj["affiliation"] = affiliation
- names.append(obj)
- return names
-
-
-def journal_info(elem: ET.Element) -> Dict[str, Any]:
- journal = dict()
- journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- journal["publisher"] = elem.findtext(
- ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
- )
- if journal["publisher"] == "":
- journal["publisher"] = None
- journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- keys = list(journal.keys())
-
- # remove empty/null keys
- for k in keys:
- if not journal[k]:
- journal.pop(k)
- return journal
-
-
-def biblio_info(elem: ET.Element) -> Dict[str, Any]:
- ref: Dict[str, Any] = dict()
- ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
- # Title stuff is messy in references...
- ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- if other_title:
- if ref["title"]:
- ref["journal"] = other_title
- else:
- ref["journal"] = None
- ref["title"] = other_title
- ref["authors"] = all_authors(elem)
- ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
- if ref["publisher"] == "":
- ref["publisher"] = None
- date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref["date"] = (date is not None) and date.attrib.get("when")
- ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- el = elem.find(".//{%s}ptr[@target]" % ns)
- if el is not None:
- ref["url"] = el.attrib["target"]
- # Hand correction
- if ref["url"].endswith(".Lastaccessed"):
- ref["url"] = ref["url"].replace(".Lastaccessed", "")
- else:
- ref["url"] = None
- return ref
-
-
-def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
-
- if isinstance(content, str):
- tree = ET.parse(io.StringIO(content))
- elif isinstance(content, bytes):
- tree = ET.parse(io.BytesIO(content))
-
- info: Dict[str, Any] = dict()
-
- # print(content)
- # print(content.getvalue())
- tei = tree.getroot()
-
- header = tei.find(".//{%s}teiHeader" % ns)
- if header is None:
- raise ValueError("XML does not look like TEI format")
- application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
- info["grobid_version"] = application_tag.attrib["version"].strip()
- info["grobid_timestamp"] = application_tag.attrib["when"].strip()
- info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- info["authors"] = all_authors(
- header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
- )
- info["journal"] = journal_info(header)
- date = header.find('.//{%s}date[@type="published"]' % ns)
- info["date"] = (date is not None) and date.attrib.get("when")
- info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
- info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
- if info["doi"]:
- info["doi"] = info["doi"].lower()
-
- refs = []
- for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
- ref = biblio_info(bs)
- ref["index"] = i
- refs.append(ref)
- info["citations"] = refs
-
- text = tei.find(".//{%s}text" % (ns))
- # print(text.attrib)
- if text and text.attrib.get("{%s}lang" % xml_ns):
- info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
-
- if encumbered:
- el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
- info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
- info["body"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
- info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
- info["annex"] = (el or None) and " ".join(el.itertext()).strip()
-
- # remove empty/null keys
- keys = list(info.keys())
- for k in keys:
- if not info[k]:
- info.pop(k)
- return info
-
-
-def main() -> None: # pragma no cover
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- description="GROBID TEI XML to JSON",
- usage="%(prog)s [options] <teifile>...",
- )
- parser.add_argument(
- "--no-encumbered",
- action="store_true",
- help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
- )
- parser.add_argument("teifiles", nargs="+")
-
- args = parser.parse_args()
-
- for filename in args.teifiles:
- content = open(filename, "r").read()
- print(
- json.dumps(
- teixml2json(content, encumbered=(not args.no_encumbered)),
- sort_keys=True,
- )
- )
-
-
-if __name__ == "__main__": # pragma no cover
- main()
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index fe507a0..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -1,21 +1,28 @@
#!/usr/bin/env python3
-
"""
These are generally for running one-off tasks from the command line. Output
might go to stdout, or might go to a Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
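+
+Example of processing a single local PDF (the GROBID host and filename here are
+placeholders):
+
+    ./grobid_tool.py --grobid-host http://localhost:8070 single paper.pdf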
"""
-import sys
-import json
import argparse
-import datetime
+import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
+
+
+def run_single(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+ resp["_metadata"] = grobid_client.metadata(resp)
+ print(json.dumps(resp, sort_keys=True))
def run_extract_json(args):
@@ -30,6 +37,7 @@ def run_extract_json(args):
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
@@ -40,7 +48,7 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
@@ -49,10 +57,11 @@ def run_extract_cdx(args):
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
if args.jobs > 1:
@@ -65,6 +74,7 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_transform(args):
grobid_client = GrobidClient()
for line in args.json_file:
@@ -74,76 +84,115 @@ def run_transform(args):
if args.metadata_only:
out = grobid_client.metadata(line)
else:
- out = teixml2json(line['tei_xml'])
+ tei_doc = parse_document_xml(line["tei_xml"])
+ out = tei_doc.to_legacy_dict()
if out:
- if 'source' in line:
- out['source'] = line['source']
+ if "source" in line:
+ out["source"] = line["source"]
print(json.dumps(out))
+def run_parse_crossref_refs(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
+ sub_single = subparsers.add_parser("single")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument(
+ "pdf_file",
+ help="path to PDF file to process",
+ type=argparse.FileType("rb"),
+ )
+
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does GROBID extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does GROBID extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
-
- sub_transform = subparsers.add_parser('transform')
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+ sub_parse_crossref_refs = subparsers.add_parser(
+ "parse-crossref-refs",
+ help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+ )
+ sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+ sub_parse_crossref_refs.add_argument(
+ "json_file",
+ help="JSON-L file to process (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_transform = subparsers.add_parser("transform")
sub_transform.set_defaults(func=run_transform)
- sub_transform.add_argument('--metadata-only',
- action='store_true',
- help="Only pass through bibliographic metadata, not fulltext")
- sub_transform.add_argument('json_file',
+ sub_transform.add_argument(
+ "--metadata-only",
+ action="store_true",
+ help="Only pass through bibliographic metadata, not fulltext",
+ )
+ sub_transform.add_argument(
+ "json_file",
help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.sink = None
if args.kafka_mode:
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 20c65bb..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
-
"""
Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
- md5
- sha1
@@ -22,87 +21,93 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+from typing import Any, Dict, Optional
-def parse(obj):
- if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
- print('skip: test item', file=sys.stderr)
+
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
+ if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"):
+ print("skip: test item", file=sys.stderr)
return None
extid_type = None
extid = None
- if obj['metadata']['identifier'].startswith('arxiv-'):
- extid_type = 'arxiv'
- extid = obj['metadata'].get('source')
+ if obj["metadata"]["identifier"].startswith("arxiv-"):
+ extid_type = "arxiv"
+ extid = obj["metadata"].get("source")
if not extid:
- print('skip: no source', file=sys.stderr)
+ print("skip: no source", file=sys.stderr)
return None
- assert extid.startswith('http://arxiv.org/abs/')
- extid = extid.replace('http://arxiv.org/abs/', '')
- #print(extid)
- assert '/' in extid or '.' in extid
- if not 'v' in extid or not extid[-1].isdigit():
- print('skip: non-versioned arxiv_id', file=sys.stderr)
+ assert extid.startswith("http://arxiv.org/abs/")
+ extid = extid.replace("http://arxiv.org/abs/", "")
+ # print(extid)
+ assert "/" in extid or "." in extid
+ if "v" not in extid or not extid[-1].isdigit():
+ print("skip: non-versioned arxiv_id", file=sys.stderr)
return None
- elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
- extid_type = 'doi'
- extid = obj['metadata']['identifier-doi']
+ elif obj["metadata"]["identifier"].startswith("paper-doi-10_"):
+ extid_type = "doi"
+ extid = obj["metadata"]["identifier-doi"]
assert extid.startswith("10.")
- elif obj['metadata']['identifier'].startswith('pubmed-PMC'):
- extid_type = 'pmcid'
- extid = obj['metadata']['identifier'].replace('pubmed-', '')
+ elif obj["metadata"]["identifier"].startswith("pubmed-PMC"):
+ extid_type = "pmcid"
+ extid = obj["metadata"]["identifier"].replace("pubmed-", "")
assert extid.startswith("PMC")
int(extid[3:])
- elif obj['metadata']['identifier'].startswith('jstor-'):
- extid_type = 'jstor'
- extid = obj['metadata']['identifier'].replace('jstor-', '')
+ elif obj["metadata"]["identifier"].startswith("jstor-"):
+ extid_type = "jstor"
+ extid = obj["metadata"]["identifier"].replace("jstor-", "")
int(extid)
else:
raise NotImplementedError()
pdf_file = None
- for f in obj['files']:
- if f['source'] == "original" and "PDF" in f['format']:
+ for f in obj["files"]:
+ if f["source"] == "original" and "PDF" in f["format"]:
pdf_file = f
break
if not pdf_file:
- print('skip: no PDF found: {}'.format(obj['metadata']['identifier']), file=sys.stderr)
- #for f in obj['files']:
+ print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr)
+ # for f in obj['files']:
# print(f['format'], file=sys.stderr)
return None
- assert pdf_file['name'].endswith('.pdf')
+ assert pdf_file["name"].endswith(".pdf")
match = {
- 'md5': pdf_file['md5'],
- 'sha1': pdf_file['sha1'],
- 'size': int(pdf_file['size']),
- 'mimetype': 'application/pdf',
- 'urls': [
+ "md5": pdf_file["md5"],
+ "sha1": pdf_file["sha1"],
+ "size": int(pdf_file["size"]),
+ "mimetype": "application/pdf",
+ "urls": [
"https://archive.org/download/{}/{}".format(
- obj['metadata']['identifier'],
- pdf_file['name']),
+ obj["metadata"]["identifier"], pdf_file["name"]
+ ),
],
- 'cdx': [],
- 'dois': [],
+ "cdx": [],
+ "dois": [],
}
- if extid_type == 'doi':
- match['dois'] = [extid,]
+ if extid_type == "doi":
+ match["dois"] = [
+ extid,
+ ]
else:
match[extid_type] = extid
return match
-def run():
+
+def run() -> None:
for line in sys.stdin:
if not line:
continue
obj = json.loads(line)
match = parse(obj)
- if match:
+ if match is not None:
print(json.dumps(match, sort_keys=True))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/ingest_file.py b/python/ingest_file.py
deleted file mode 100755
index f6f694e..0000000
--- a/python/ingest_file.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import argparse
-
-from http.server import HTTPServer
-from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
-
-
-def run_single_ingest(args):
- request = dict(
- ingest_type=args.type,
- base_url=args.url,
- ext_ids=dict(doi=args.doi),
- fatcat=dict(release_ident=args.release_id),
- )
- if args.force_recrawl:
- request['force_recrawl'] = True
- ingester = IngestFileWorker(
- try_spn2=not args.no_spn2,
- )
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
- return result
-
-def run_requests(args):
- # TODO: switch to using JsonLinePusher
- ingester = IngestFileWorker()
- for l in args.json_file:
- request = json.loads(l.strip())
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
-
-def run_api(args):
- port = 8083
- print("Listening on localhost:{}".format(port))
- server = HTTPServer(('', port), IngestFileRequestHandler)
- server.serve_forever()
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- subparsers = parser.add_subparsers()
-
- sub_single= subparsers.add_parser('single',
- help="ingests a single file URL")
- sub_single.set_defaults(func=run_single_ingest)
- sub_single.add_argument('--release-id',
- help="(optional) existing release ident to match to")
- sub_single.add_argument('--doi',
- help="(optional) existing release DOI to match to")
- sub_single.add_argument('--force-recrawl',
- action='store_true',
- help="ignore GWB history and use SPNv2 to re-crawl")
- sub_single.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
- sub_single.add_argument('--type',
- default="pdf",
- help="type of ingest (pdf, html, etc)")
- sub_single.add_argument('url',
- help="URL of paper to fetch")
-
- sub_requests = subparsers.add_parser('requests',
- help="takes a series of ingest requests (JSON, per line) and runs each")
- sub_requests.set_defaults(func=run_requests)
- sub_requests.add_argument('json_file',
- help="JSON file (request per line) to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
-
- sub_api = subparsers.add_parser('api',
- help="starts a simple HTTP server that processes ingest requests")
- sub_api.set_defaults(func=run_api)
- sub_api.add_argument('--port',
- help="HTTP port to listen on",
- default=8033, type=int)
-
- args = parser.parse_args()
- if not args.__dict__.get("func"):
- print("tell me what to do!", file=sys.stderr)
- sys.exit(-1)
-
- args.func(args)
-
-if __name__ == '__main__':
- main()
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
new file mode 100755
index 0000000..0b74f9f
--- /dev/null
+++ b/python/ingest_tool.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import subprocess
+import sys
+from http.server import HTTPServer
+
+import sentry_sdk
+
+from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
+from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
+from sandcrawler.ingest_fileset import IngestFilesetWorker
+
+
+def run_single_ingest(args):
+ request = dict(
+ ingest_type=args.ingest_type,
+ base_url=args.url,
+ ext_ids=dict(doi=args.doi),
+ fatcat=dict(release_ident=args.release_id),
+ )
+ if args.force_recrawl:
+ request["force_recrawl"] = True
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ ingester = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ ingest_file_result_stdout=True,
+ )
+ else:
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ grobid_client=grobid_client,
+ )
+ result = ingester.process(request)
+ print(json.dumps(result, sort_keys=True))
+ return result
+
+
+def run_requests(args):
+ # TODO: switch to using JsonLinePusher
+ file_worker = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ )
+ fileset_worker = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ )
+ for line in args.json_file:
+ request = json.loads(line.strip())
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ result = fileset_worker.process(request)
+ else:
+ result = file_worker.process(request)
+ print(json.dumps(result, sort_keys=True))
+
+
+def run_file_requests_backfill(args):
+ """
+ Special mode for persisting GROBID and pdfextract results to Kafka, but
+ printing ingest result to stdout.
+
+ Can be used to batch re-process known files.
+ """
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
+ worker = IngestFileWorker(
+ grobid_client=grobid_client,
+ sink=None,
+ grobid_sink=grobid_sink,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ try_spn2=False,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
+
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
+def run_api(args):
+ port = 8083
+ print("Listening on localhost:{}".format(port))
+ server = HTTPServer(("", port), IngestFileRequestHandler)
+ server.serve_forever()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
+ subparsers = parser.add_subparsers()
+
+ sub_single = subparsers.add_parser("single", help="ingests a single base URL")
+ sub_single.set_defaults(func=run_single_ingest)
+ sub_single.add_argument(
+ "ingest_type", default="pdf", help="type of ingest (pdf, html, etc)"
+ )
+ sub_single.add_argument(
+ "--release-id", help="(optional) existing release ident to match to"
+ )
+ sub_single.add_argument("--doi", help="(optional) existing release DOI to match to")
+ sub_single.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="ignore GWB history and use SPNv2 to re-crawl",
+ )
+ sub_single.add_argument("--no-spn2", action="store_true", help="don't use live web (SPNv2)")
+ sub_single.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_single.add_argument("url", help="URL of paper to fetch")
+ sub_single.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
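+    # Example single-file ingest (hypothetical URL, for illustration only):
+    #   ./ingest_tool.py single pdf https://example.com/paper.pdf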
+
+ sub_requests = subparsers.add_parser(
+ "requests", help="takes a series of ingest requests (JSON, per line) and runs each"
+ )
+ sub_requests.add_argument(
+ "--no-spn2", action="store_true", help="don't use live web (SPNv2)"
+ )
+ sub_requests.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_requests.set_defaults(func=run_requests)
+ sub_requests.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub_api = subparsers.add_parser(
+ "api", help="starts a simple HTTP server that processes ingest requests"
+ )
+ sub_api.set_defaults(func=run_api)
+ sub_api.add_argument("--port", help="HTTP port to listen on", default=8033, type=int)
+
+ sub_file_requests_backfill = subparsers.add_parser(
+ "file-requests-backfill",
+ help="starts a simple HTTP server that processes ingest requests",
+ )
+ sub_file_requests_backfill.set_defaults(func=run_file_requests_backfill)
+ sub_file_requests_backfill.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_file_requests_backfill.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ sub_file_requests_backfill.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 0d33ec9..28d6397 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -1,15 +1,11 @@
#!/usr/bin/env python3
-
"""
KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
-import sys
-import json
import argparse
-import datetime
+import sys
-from grobid2json import teixml2json
from sandcrawler import *
@@ -20,10 +16,13 @@ def run_extract_json(args):
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
wayback_client = WaybackClient()
if args.jobs > 1:
@@ -33,19 +32,22 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
if args.jobs > 1:
print("multi-processing: {}".format(args.jobs), file=sys.stderr)
@@ -57,9 +59,10 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
- with open(args.pdf_file, 'rb') as pdf_file:
+ with open(args.pdf_file, "rb") as pdf_file:
pdf_bytes = pdf_file.read()
worker.push_record(pdf_bytes)
worker.finish()
@@ -67,55 +70,59 @@ def run_single(args):
args.thumbnail_sink.finish()
-
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does PDF extraction")
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does PDF extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does PDF extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does PDF extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does PDF extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does PDF extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
- sub_single = subparsers.add_parser('single',
- help="opens single PDF and extracts it")
+ sub_single = subparsers.add_parser("single", help="opens single PDF and extracts it")
sub_single.set_defaults(func=run_single)
- sub_single.add_argument('pdf_file',
- help="single PDF to extract",
- type=str)
+ sub_single.add_argument("pdf_file", help="single PDF to extract", type=str)
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!", file=sys.stderr)
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.text_sink = None
@@ -123,17 +130,22 @@ def main():
if args.kafka_mode:
text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=text_topic)
- args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=thumbnail_topic)
- print("Running in kafka output mode, publishing to {} and {}\n".format(
- text_topic, thumbnail_topic), file=sys.stderr)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic)
+ args.thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic
+ )
+ print(
+ "Running in kafka output mode, publishing to {} and {}\n".format(
+ text_topic, thumbnail_topic
+ ),
+ file=sys.stderr,
+ )
else:
args.sink = None
args.thumbnail_sink = None
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index ec92afe..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -1,18 +1,15 @@
#!/usr/bin/env python3
-
"""
Basically just a copy of grobid_tool.py, but for PDF classification instead of
text extraction.
Example of large parallel run, locally:
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
-import sys
-import json
import argparse
-import datetime
+import sys
from sandcrawler import *
@@ -21,37 +18,47 @@ def run_classify_pdf_json(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_classify_pdf_cdx(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = CdxLinePusher(
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_classify_pdf_zipfile(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode)
@@ -60,62 +67,73 @@ def run_classify_pdf_zipfile(args):
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--pdftrio-host',
- default="http://pdftrio.qa.fatcat.wiki",
- help="pdftrio API host/port")
- parser.add_argument('--pdftrio-mode',
- default="auto",
- help="which classification mode to use")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--pdftrio-host", default="http://pdftrio.qa.fatcat.wiki", help="pdftrio API host/port"
+ )
+ parser.add_argument(
+ "--pdftrio-mode", default="auto", help="which classification mode to use"
+ )
subparsers = parser.add_subparsers()
- sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json',
- help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_json = subparsers.add_parser(
+ "classify-pdf-json",
+ help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json)
- sub_classify_pdf_json.add_argument('json_file',
+ sub_classify_pdf_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_cdx = subparsers.add_parser('classify-pdf-cdx',
- help="for each CDX line, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_cdx = subparsers.add_parser(
+ "classify-pdf-cdx",
+ help="for each CDX line, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx)
- sub_classify_pdf_cdx.add_argument('cdx_file',
+ sub_classify_pdf_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_zipfile = subparsers.add_parser('classify-pdf-zipfile',
- help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each")
+ sub_classify_pdf_zipfile = subparsers.add_parser(
+ "classify-pdf-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each",
+ )
sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile)
- sub_classify_pdf_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to classify",
- type=str)
+ sub_classify_pdf_zipfile.add_argument(
+ "zip_file", help="zipfile with PDFs to classify", type=str
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.sink = None
if args.kafka_mode:
produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 66e02aa..e08d66c 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,15 +1,14 @@
#!/usr/bin/env python3
-
"""
-Commands for backfilling content from bulk files into postgresql and s3 (minio).
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import argparse
import os
import sys
-import argparse
from sandcrawler import *
from sandcrawler.persist import *
@@ -19,7 +18,7 @@ def run_cdx(args):
worker = PersistCdxWorker(
db_url=args.db_url,
)
- filter_mimetypes = ['application/pdf']
+ filter_mimetypes = ["application/pdf"]
if args.no_mimetype_filter:
filter_mimetypes = None
pusher = CdxLinePusher(
@@ -27,11 +26,12 @@ def run_cdx(args):
args.cdx_file,
filter_http_statuses=[200, 226],
filter_mimetypes=filter_mimetypes,
- #allow_octet_stream
+ # allow_octet_stream
batch_size=200,
)
pusher.run()
+
def run_grobid(args):
worker = PersistGrobidWorker(
db_url=args.db_url,
@@ -49,6 +49,7 @@ def run_grobid(args):
)
pusher.run()
+
def run_grobid_disk(args):
"""
Writes XML to individual files on disk, and also prints non-XML metadata to
@@ -63,6 +64,7 @@ def run_grobid_disk(args):
)
pusher.run()
+
def run_pdftrio(args):
worker = PersistPdfTrioWorker(
db_url=args.db_url,
@@ -74,6 +76,7 @@ def run_pdftrio(args):
)
pusher.run()
+
def run_pdftext(args):
worker = PersistPdfTextWorker(
db_url=args.db_url,
@@ -91,6 +94,7 @@ def run_pdftext(args):
)
pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -102,6 +106,7 @@ def run_ingest_file_result(args):
)
pusher.run()
+
def run_ingest_request(args):
worker = PersistIngestRequestWorker(
db_url=args.db_url,
@@ -113,92 +118,186 @@ def run_ingest_request(args):
)
pusher.run()
+
+def run_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
+def run_grobid_refs(args):
+ worker = PersistGrobidRefsWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--db-url',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (minio) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (minio) bucket to persist into",
- default="sandcrawler-dev")
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_cdx = subparsers.add_parser('cdx',
- help="backfill a CDX file into postgresql cdx table")
+ sub_cdx = subparsers.add_parser("cdx", help="backfill a CDX file into postgresql cdx table")
sub_cdx.set_defaults(func=run_cdx)
- sub_cdx.add_argument('cdx_file',
+ sub_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_cdx.add_argument('--no-mimetype-filter',
- action='store_true',
- help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
+ type=argparse.FileType("r"),
+ )
+ sub_cdx.add_argument(
+ "--no-mimetype-filter",
+ action="store_true",
+ help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)",
+ )
- sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)")
+ sub_grobid = subparsers.add_parser(
+ "grobid", help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_grobid.set_defaults(func=run_grobid)
- sub_grobid.add_argument('json_file',
+ sub_grobid.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_pdftext = subparsers.add_parser('pdftext',
- help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ type=argparse.FileType("r"),
+ )
+ sub_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_pdftext = subparsers.add_parser(
+ "pdftext", help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftext.set_defaults(func=run_pdftext)
- sub_pdftext.add_argument('json_file',
+ sub_pdftext.add_argument(
+ "json_file",
help="pdftext file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_grobid_disk = subparsers.add_parser('grobid-disk',
- help="dump GRBOID output to (local) files on disk")
+ type=argparse.FileType("r"),
+ )
+ sub_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_grobid_disk = subparsers.add_parser(
+ "grobid-disk", help="dump GRBOID output to (local) files on disk"
+ )
sub_grobid_disk.set_defaults(func=run_grobid_disk)
- sub_grobid_disk.add_argument('json_file',
+ sub_grobid_disk.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid_disk.add_argument('output_dir',
- help="base directory to output into",
- type=str)
+ type=argparse.FileType("r"),
+ )
+ sub_grobid_disk.add_argument("output_dir", help="base directory to output into", type=str)
- sub_pdftrio = subparsers.add_parser('pdftrio',
- help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)")
+ sub_pdftrio = subparsers.add_parser(
+ "pdftrio", help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftrio.set_defaults(func=run_pdftrio)
- sub_pdftrio.add_argument('json_file',
+ sub_pdftrio.add_argument(
+ "json_file",
help="pdftrio file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
- help="backfill a ingest_file_result JSON dump into postgresql")
+ sub_ingest_file_result = subparsers.add_parser(
+ "ingest-file-result", help="backfill a ingest_file_result JSON dump into postgresql"
+ )
sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
- sub_ingest_file_result.add_argument('json_file',
+ sub_ingest_file_result.add_argument(
+ "json_file",
help="ingest_file_result file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_request = subparsers.add_parser('ingest-request',
- help="backfill a ingest_request JSON dump into postgresql")
+ sub_ingest_request = subparsers.add_parser(
+ "ingest-request", help="backfill a ingest_request JSON dump into postgresql"
+ )
sub_ingest_request.set_defaults(func=run_ingest_request)
- sub_ingest_request.add_argument('json_file',
+ sub_ingest_request.add_argument(
+ "json_file",
help="ingest_request to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
+
+ sub_crossref = subparsers.add_parser(
+ "crossref",
+ help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+ )
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument(
+ "json_file",
+ help="crossref file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+
+ sub_grobid_refs = subparsers.add_parser(
+ "grobid-refs", help="backfill a grobid_refs JSON dump into postgresql"
+ )
+ sub_grobid_refs.set_defaults(func=run_grobid_refs)
+ sub_grobid_refs.add_argument(
+ "json_file",
+ help="grobid_refs to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -207,5 +306,6 @@ def main():
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..2cef007
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.isort]
+profile = "black"
+line_length = 96
diff --git a/python/pytest.ini b/python/pytest.ini
index 65f81da..18e8cf0 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -1,7 +1,5 @@
[pytest]
-ignore = setup.py
-
# allow imports from files in current directory
python_paths = .
@@ -18,5 +16,11 @@ filterwarnings =
ignore::DeprecationWarning:.*urllib3
ignore::DeprecationWarning:.*wayback
ignore::DeprecationWarning:.*PIL
+ ignore::DeprecationWarning:.*justext
+ ignore::DeprecationWarning:.*internetarchive
+ ignore::DeprecationWarning:.*minio
+ ignore::DeprecationWarning:.*base_reporter
+ ignore::DeprecationWarning:.*loccache
+ ignore:.*pytz-deprecation-shim
log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 71c2023..469c2a2 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,10 +1,49 @@
-
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
-from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (
+ CdxApiClient,
+ CdxApiError,
+ CdxPartial,
+ CdxRow,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WarcResource,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .ingest_file import IngestFileWorker
+from .ingest_fileset import IngestFilesetWorker
+from .misc import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_datetime,
+ parse_cdx_line,
+)
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (
+ PersistCdxWorker,
+ PersistGrobidDiskWorker,
+ PersistGrobidWorker,
+ PersistIngestFileResultWorker,
+ PersistIngestRequestWorker,
+ PersistPdfTextWorker,
+ PersistPdfTrioWorker,
+ PersistThumbnailWorker,
+)
+from .workers import (
+ BlackholeSink,
+ CdxLinePusher,
+ JsonLinePusher,
+ KafkaCompressSink,
+ KafkaJsonPusher,
+ KafkaSink,
+ MultiprocessWrapper,
+ ZipfilePusher,
+)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 793f1c4..f9018ec 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,87 +1,161 @@
-
-import json
import datetime
+import json
+from typing import Any, Dict, List, Optional, Tuple
import psycopg2
import psycopg2.extras
-import requests
-class SandcrawlerPostgrestClient:
+from .misc import requests_retry_session
- def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
+
+class SandcrawlerPostgrestClient:
+ def __init__(self, api_url: str = "http://wbgrp-svc506.us.archive.org:3030", **kwargs):
self.api_url = api_url
+ self.http_session = requests_retry_session()
- def get_cdx(self, url):
- resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
+ def get_cdx(self, url: str) -> Optional[dict]:
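+        # PostgREST filter syntax: the "eq." prefix on a query parameter means
+        # an equality filter on that column (eg, ?url=eq.<value>); the same
+        # convention is used by the other lookup methods below.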
+ resp = self.http_session.get(self.api_url + "/cdx", params=dict(url="eq." + url))
resp.raise_for_status()
return resp.json() or None
- def get_grobid(self, sha1):
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1))
+ def get_grobid(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdftrio(self, sha1):
- resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.'+sha1))
+ def get_pdftrio(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdf_meta(self, sha1):
- resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_pdf_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_file_meta(self, sha1):
- resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_html_meta(self, sha1hex: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/html_meta",
+ params=dict(sha1hex=f"eq.{sha1hex}"),
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_ingest_file_result(self, url):
- resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+ def get_file_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
-class SandcrawlerPostgresClient:
+ def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_file_result",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
- def __init__(self, db_url, **kwargs):
+ def get_ingest_fileset_platform(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_fileset_platform",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref_with_refs(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}")
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_grobid_refs(self, source: str, source_id: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid_refs",
+ params=dict(source=f"eq.{source}", source_id=f"eq.{source_id}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+
+class SandcrawlerPostgresClient:
+ def __init__(self, db_url: str, **kwargs):
self.conn = psycopg2.connect(db_url)
- def cursor(self):
+ def cursor(self) -> psycopg2.extensions.cursor:
return self.conn.cursor()
- def commit(self):
- return self.conn.commit()
+ def commit(self) -> None:
+ self.conn.commit()
- def _inserts_and_updates(self, resp, on_conflict):
- resp = [int(r[0]) for r in resp]
- inserts = len([r for r in resp if r == 0])
+ def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
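+        # Each element of `resp` is a "RETURNING xmax" row: PostgreSQL reports
+        # xmax as 0 for freshly inserted rows and non-zero when an existing row
+        # was updated via ON CONFLICT, so counting zeros and non-zeros splits
+        # the batch into (inserts, updates).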
+ resp_codes = [int(r[0]) for r in resp]
+ inserts = len([r for r in resp_codes if r == 0])
if on_conflict == "update":
- updates = len([r for r in resp if r != 0])
+ updates = len([r for r in resp_codes if r != 0])
else:
updates = 0
return (inserts, updates)
- def insert_cdx(self, cur, batch, on_conflict="nothing"):
+ def insert_cdx(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -94,26 +168,35 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d for d in batch if d.get('warc_path')]
+ batch = [d for d in batch if d.get("warc_path")]
if not batch:
return (0, 0)
- batch = [(d['url'],
- d['datetime'],
- d['sha1hex'],
- d['mimetype'],
- d['warc_path'],
- int(d['warc_csize']),
- int(d['warc_offset']))
- for d in batch]
+ rows = [
+ (
+ d["url"],
+ d["datetime"],
+ d["sha1hex"],
+ d["mimetype"],
+ d["warc_path"],
+ int(d["warc_csize"]),
+ int(d["warc_offset"]),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (url, datetime)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_file_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_file_meta(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -132,21 +215,24 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['sha1hex'],
- d['sha256hex'],
- d['md5hex'],
- int(d['size_bytes']),
- d['mimetype'])
- for d in batch]
+ rows = [
+ (d["sha1hex"], d["sha256hex"], d["md5hex"], int(d["size_bytes"]), d["mimetype"])
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_grobid(self, cur, batch, on_conflict="nothing"):
+ def insert_grobid(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -168,33 +254,39 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
for r in batch:
- if r.get('metadata'):
+ if r.get("metadata"):
# sometimes these are only in metadata; shouldn't pass through
# though (to save database space)
- dupe_fields = ('fatcat_release', 'grobid_version')
+ dupe_fields = ("fatcat_release", "grobid_version")
for k in dupe_fields:
- if not k in r:
- r[k] = r['metadata'].get(k)
- r['metadata'].pop(k, None)
- r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
- batch = [(d['key'],
- d.get('grobid_version') or None,
- d['status_code'],
- d['status'],
- d.get('fatcat_release') or None,
- d.get('updated') or datetime.datetime.now(),
- d.get('metadata') or None ,
- )
- for d in batch]
+ if k not in r:
+ r[k] = r["metadata"].get(k)
+ r["metadata"].pop(k, None)
+ r["metadata"] = json.dumps(r["metadata"], sort_keys=True)
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["key"],
+ d.get("grobid_version") or None,
+ d["status_code"],
+ d["status"],
+ d.get("fatcat_release") or None,
+ d.get("updated") or now,
+ d.get("metadata") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdf_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_pdf_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -223,16 +315,56 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d.to_sql_tuple() for d in batch]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_html_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
+ """
+        rows are expected to be pre-rendered SQL tuples (eg, from a .to_sql_tuple() call)
+ """
+ sql = """
+ INSERT INTO
+ html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status=EXCLUDED.status,
+ scope=EXCLUDED.scope,
+ has_teixml=EXCLUDED.has_teixml,
+ has_thumbnail=EXCLUDED.has_thumbnail,
+ word_count=EXCLUDED.word_count,
+ biblio=EXCLUDED.biblio,
+ resources=EXCLUDED.resources
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
+ def insert_pdftrio(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
@@ -258,29 +390,36 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [
+ now = datetime.datetime.now()
+ rows = [
(
- d['key'],
- d.get('updated') or datetime.datetime.now(),
- d['status_code'],
- d['status'],
- d.get('versions', {}).get('pdftrio_version') or None,
- d.get('versions', {}).get('models_date') or None,
- d.get('ensemble_score'),
- d.get('bert_score'),
- d.get('linear_score'),
- d.get('image_score'),
+ d["key"],
+ d.get("updated") or now,
+ d["status_code"],
+ d["status"],
+ d.get("versions", {}).get("pdftrio_version") or None,
+ d.get("versions", {}).get("models_date") or None,
+ d.get("ensemble_score"),
+ d.get("bert_score"),
+ d.get("linear_score"),
+ d.get("image_score"),
)
- for d in batch]
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_request(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_request(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -294,35 +433,43 @@ class SandcrawlerPostgresClient:
sql += " RETURNING xmax;"
for r in batch:
# in case these fields were already packed into 'request'
- extra = r.get('request', {})
- for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'):
+ extra = r.get("request", {})
+ for k in ("ext_ids", "fatcat_release", "edit_extra", "rel"):
if r.get(k):
extra[k] = r[k]
if extra:
- r['extra'] = json.dumps(extra, sort_keys=True)
- batch = [(d['link_source'],
- d['link_source_id'],
- d['ingest_type'],
- d['base_url'],
- d.get('ingest_request_source'),
- d.get('release_stage') or None,
- d.get('extra') or None,
- )
- for d in batch]
+ r["extra"] = json.dumps(extra, sort_keys=True)
+ rows = [
+ (
+ d["link_source"],
+ d["link_source_id"],
+ d["ingest_type"],
+ d["base_url"],
+ d.get("ingest_request_source"),
+ d.get("release_stage") or None,
+ d.get("extra") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1], b[2], b[3])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1], b[2], b[3])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_file_result(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_file_result(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
VALUES %s
- ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
+ ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
"""
if on_conflict.lower() == "nothing":
sql += " NOTHING"
@@ -339,20 +486,165 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['ingest_type'],
- d['base_url'],
- bool(d['hit']),
- d['status'],
- d.get('terminal_url'),
- d.get('terminal_dt'),
- d.get('terminal_status_code'),
- d.get('terminal_sha1hex'),
- )
- for d in batch]
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("terminal_url"),
+ d.get("terminal_dt"),
+ d.get("terminal_status_code"),
+ d.get("terminal_sha1hex"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_fileset_platform(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT ingest_fileset_platform_pkeypkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=now(),
+ hit=EXCLUDED.hit,
+ status=EXCLUDED.status,
+ platform_name=EXCLUDED.platform_name,
+ platform_domain=EXCLUDED.platform_domain,
+ platform_id=EXCLUDED.platform_id,
+ ingest_strategy=EXCLUDED.ingest_strategy,
+ total_size=EXCLUDED.total_size,
+ file_count=EXCLUDED.file_count,
+ archiveorg_item_name=EXCLUDED.archiveorg_item_name,
+ archiveorg_item_bundle_path=EXCLUDED.archiveorg_item_bundle_path,
+ web_bundle_url=EXCLUDED.web_bundle_url,
+ web_bundle_dt=EXCLUDED.web_bundle_dt,
+ manifest=EXCLUDED.manifest
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("platform_name"),
+ d.get("platform_domain"),
+ d.get("platform_id"),
+ d.get("ingest_strategy"),
+ d.get("total_size"),
+ d.get("file_count"),
+ d.get("archiveorg_item_name"),
+ d.get("archiveorg_item_bundle_path"),
+ d.get("web_bundle_url"),
+ d.get("web_bundle_dt"),
+ d.get("manifest"),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_crossref(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ crossref (doi, indexed, record)
+ VALUES %s
+ ON CONFLICT (doi) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ indexed=EXCLUDED.indexed,
+ record=EXCLUDED.record
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["doi"],
+ d.get("indexed") or None,
+ json.dumps(d["record"], sort_keys=True),
+ )
+ for d in batch
+ ]
+        # filter out duplicate rows by key (doi)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_grobid_refs(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ grobid_refs (source, source_id, source_ts, updated, refs_json)
+ VALUES %s
+ ON CONFLICT (source, source_id) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ source_ts=EXCLUDED.source_ts,
+ updated=EXCLUDED.updated,
+ refs_json=EXCLUDED.refs_json
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["source"],
+ d["source_id"],
+ d.get("source_ts") or None,
+ d.get("updated") or now,
+ json.dumps(d["refs_json"], sort_keys=True),
+ )
+ for d in batch
+ ]
+        # filter out duplicate rows by key (source, source_id)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..5c13318
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,832 @@
+import urllib.parse
+from typing import Optional, Tuple
+
+import internetarchive
+
+from sandcrawler.fileset_types import (
+ FilesetManifestFile,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.misc import requests_retry_session
+
+
+class FilesetPlatformHelper:
+ def __init__(self):
+ self.platform_name = "unknown"
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ """
+ Does this request look like it matches this platform?
+ """
+ raise NotImplementedError()
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+ raise NotImplementedError()
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ assert item.manifest
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
+ largest_size = max([m.size or 0 for m in item.manifest]) or 0
+ if len(item.manifest) == 1:
+ if total_size < 64 * 1024 * 1024:
+ return IngestStrategy.WebFile
+ else:
+ return IngestStrategy.ArchiveorgFile
+ else:
+ if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024:
+ return IngestStrategy.WebFileset
+ else:
+ return IngestStrategy.ArchiveorgFileset
+
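A hedged usage sketch of the default strategy picker above; the manifest and file sizes are invented, and two 10 MiB files stay under both the 64 MiB per-file and 128 GiB total thresholds, so WebFileset is chosen:

    from sandcrawler.fileset_platforms import FilesetPlatformHelper
    from sandcrawler.fileset_types import (
        FilesetManifestFile,
        FilesetPlatformItem,
        IngestStrategy,
    )

    item = FilesetPlatformItem(
        platform_name="example",   # invented values, for illustration only
        platform_status="success",
        manifest=[
            FilesetManifestFile(path="data.csv", size=10 * 1024 * 1024),
            FilesetManifestFile(path="readme.txt", size=10 * 1024 * 1024),
        ],
    )
    assert FilesetPlatformHelper().chose_strategy(item) == IngestStrategy.WebFileset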
+
+class DataverseHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "dataverse"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_dataverse_persistentid(pid: str) -> dict:
+ """
+ Parses a persistentId into 5 sections:
+
+ - type (doi or hdl)
+ - authority (eg, DOI prefix)
+ - shoulder (optional, eg 'DVN')
+        - dataset_id (6-character)
+ - file_id
+
+        The returned dict always has all components; optional components are None when missing.
+
+ This is possible because the dataverse software only supports a handful
+        of configurations and persistent identifier types.
+
+ If there is an error parsing, raises a ValueError
+ """
+ id_type = None
+ if pid.startswith("doi:10."):
+ id_type = "doi"
+ pid = pid[4:]
+ elif pid.startswith("hdl:"):
+ id_type = "hdl"
+ pid = pid[4:]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ comp = pid.split("/")
+ if len(comp) < 2:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ authority = comp[0]
+ shoulder = None
+ dataset_id = None
+ file_id = None
+ if len(comp[1]) != 6 and len(comp) == 3:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ elif len(comp[1]) != 6 and len(comp) == 4:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ file_id = comp[3]
+ elif len(comp[1]) == 6 and len(comp) == 2:
+ dataset_id = comp[1]
+ elif len(comp[1]) == 6 and len(comp) == 3:
+ dataset_id = comp[1]
+ file_id = comp[2]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ if len(dataset_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse dataset id: {dataset_id}")
+ if file_id and len(file_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse file id: {file_id}")
+
+ return {
+ "type": id_type,
+ "authority": authority,
+ "shoulder": shoulder,
+ "dataset_id": dataset_id,
+ "file_id": file_id,
+ }
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: could also do HTML platform detection or something?
+
+ components = urllib.parse.urlparse(url)
+ # platform_domain = components.netloc.split(':')[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not id_param:
+ return False
+ platform_id = id_param[0]
+
+ try:
+ self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ return False
+
+ return True
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+
+
+ HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not (id_param and id_param[0]):
+ raise PlatformScopeError("Expected a Dataverse persistentId in URL")
+ platform_id = id_param[0]
+ version_param = params.get("version")
+ dataset_version = None
+ if version_param:
+ dataset_version = version_param[0]
+
+ try:
+ parsed_id = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ raise PlatformScopeError("not actually in scope")
+
+ if parsed_id["file_id"]:
+ # TODO: maybe we could support this?
+ raise PlatformScopeError(
+ "only entire dataverse datasets can be archived with this tool"
+ )
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ if not dataset_version:
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+ if "latestVersion" not in obj["data"]:
+ raise PlatformScopeError("could not find latest version for dataverse record")
+ obj_latest = obj["data"]["latestVersion"]
+ dataset_version = (
+ f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ obj_latest = obj["data"]["latestVersion"]
+ assert (
+ dataset_version
+ == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+ assert platform_id == obj_latest["datasetPersistentId"]
+
+ manifest = []
+ for row in obj_latest["files"]:
+ df = row["dataFile"]
+ df_persistent_id = df["persistentId"]
+ platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
+ if df.get("originalFileName"):
+ platform_url += "&format=original"
+
+ extra = dict()
+ # TODO: always save the version field?
+ if row.get("version") != 1:
+ extra["version"] = row["version"]
+ if "description" in df:
+ extra["description"] = df["description"]
+ manifest.append(
+ FilesetManifestFile(
+ path=df.get("originalFileName") or df["filename"],
+ size=df.get("originalFileSize") or df["filesize"],
+ md5=df["md5"],
+ # NOTE: don't get: sha1, sha256
+ mimetype=df["contentType"],
+ platform_url=platform_url,
+ extra=extra or None,
+ )
+ )
+
+ platform_sub_id = platform_id.split("/")[-1]
+ archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ date=obj_latest["releaseTime"].split("T")[0],
+ source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+ )
+ if platform_id.startswith("doi:10."):
+ archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+ for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+ if block["typeName"] == "title":
+ archiveorg_item_meta["title"] = block["value"]
+ elif block["typeName"] == "depositor":
+ archiveorg_item_meta["creator"] = block["value"]
+ elif block["typeName"] == "dsDescription":
+ archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+ "value"
+ ]
+
+ archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+ if obj_latest.get("termsOfUse"):
+ archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
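For orientation, the slice of the Dataverse native API response that process_request() walks looks roughly like this; the keys are taken from the accesses above, and all values are invented except the example persistentId from the docstring:

    dataverse_api_obj = {
        "data": {
            "latestVersion": {
                "datasetPersistentId": "doi:10.5072/FK2/J8SJZB",
                "versionNumber": 1,
                "versionMinorNumber": 0,
                "releaseTime": "2021-01-01T00:00:00Z",
                "termsOfUse": "CC0 Waiver",
                "metadataBlocks": {
                    "citation": {
                        "fields": [
                            {"typeName": "title", "value": "Example Dataset"},
                        ],
                    },
                },
                "files": [
                    {
                        "version": 1,
                        "dataFile": {
                            "persistentId": "doi:10.5072/FK2/J8SJZB/ABC123",
                            "filename": "data.tab",
                            "originalFileName": "data.csv",
                            "filesize": 1234,
                            "originalFileSize": 2345,
                            "md5": "0f343b0931126a20f133d67c2b018a3b",
                            "contentType": "text/tab-separated-values",
                        },
                    },
                ],
            },
        },
    }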
+
+def test_parse_dataverse_persistentid() -> None:
+
+ valid = {
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": "LL6WXZ",
+ },
+ "hdl:20.500.12690/RIN/IDDOAH/BTNH25": {
+ "type": "hdl",
+ "authority": "20.500.12690",
+ "shoulder": "RIN",
+ "dataset_id": "IDDOAH",
+ "file_id": "BTNH25",
+ },
+ "doi:10.7910/DVN/6HPRIG": {
+ "type": "doi",
+ "authority": "10.7910",
+ "shoulder": "DVN",
+ "dataset_id": "6HPRIG",
+ "file_id": None,
+ },
+ }
+
+ invalid = [
+ # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ "doi:10.25625/abcd",
+ "other:10.25625/LL6WXZ",
+ "10.25625/LL6WXZ",
+ "doi:10.5072/FK2/J8SJZB/LL6WXZv123",
+ ]
+
+ for pid, val in valid.items():
+ assert DataverseHelper.parse_dataverse_persistentid(pid) == val
+
+ for pid in invalid:
+ try:
+ DataverseHelper.parse_dataverse_persistentid(pid)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class FigshareHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "figshare"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_figshare_url_path(path: str) -> Tuple[str, Optional[str]]:
+ """
+ Tries to parse a figshare URL into ID number and (optional) version number.
+
+ Returns a two-element tuple; version number will be None if not found
+
+ Raises a ValueError if not a figshare URL
+ """
+ # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
+
+ comp = path.split("/")
+ if len(comp) < 4 or comp[1] != "articles":
+ raise ValueError(f"not a figshare URL: {path}")
+
+ comp = comp[2:]
+ if comp[0] in [
+ "dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
+ ]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
+ else:
+ raise ValueError(f"couldn't find figshare identiier: {path}")
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ # only work with full, versioned figshare.com URLs
+ if "figshare.com" not in platform_domain:
+ return False
+
+ try:
+ parsed = self.parse_figshare_url_path(components.path)
+ except ValueError:
+ return False
+
+ # has file component
+ if parsed[0] and parsed[1]:
+ return True
+
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+ assert (
+ dataset_version and dataset_version.isdigit()
+ ), f"expected numeric: {dataset_version}"
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ # TODO: implement this code path
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ # figshare_type = obj['defined_type_name']
+
+ if not obj["is_public"]:
+ raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+ if obj["is_embargoed"]:
+ raise PlatformRestrictedError(
+ f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ manifest.append(
+ FilesetManifestFile(
+ path=row["name"],
+ size=row["size"],
+ md5=row["computed_md5"],
+ # NOTE: don't get: sha1, sha256, mimetype
+ platform_url=row["download_url"],
+ # extra=dict(),
+ )
+ )
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
+
+ authors = []
+ for author in obj["authors"]:
+ authors.append(author["full_name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["title"],
+ date=obj["published_date"],
+ source=obj["url_public_html"],
+ description=obj["description"],
+ license=obj["license"]["url"],
+ version=obj["version"],
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_figshare_url_path() -> None:
+
+ valid = {
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+ "8987858",
+ "1",
+ ),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+ "8987858",
+ None,
+ ),
+ "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+ "12127176",
+ "4",
+ ),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
+ }
+
+ invalid = [
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+ ]
+
+ for path, val in valid.items():
+ assert FigshareHelper.parse_figshare_url_path(path) == val
+
+ for path in invalid:
+ try:
+ FigshareHelper.parse_figshare_url_path(path)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class ZenodoHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "zenodo"
+ self.session = requests_retry_session()
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if platform_domain == "zenodo.org" and "/record/" in components.path:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: also look in base_url and resource-non-terminal for ident? to
+ # check for work-level redirects
+
+ # 1. extract identifier from URL
+ # eg: https://zenodo.org/record/5230255
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+        if len(components.path.split("/")) < 3:
+            raise PlatformScopeError("Expected a complete zenodo record URL")
+
+ platform_id = components.path.split("/")[2]
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+
+ if "zenodo.org" not in platform_domain:
+ raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
+
+ # 2. API fetch
+ resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}", timeout=60.0)
+ if resp.status_code == 410:
+ raise PlatformRestrictedError("record deleted")
+ resp.raise_for_status()
+ obj = resp.json()
+
+ assert obj["id"] == int(platform_id)
+ work_id = obj["conceptrecid"]
+ if work_id == obj["id"]:
+            raise PlatformScopeError(
+                f"got a work-level zenodo record, not a versioned record: {work_id}"
+            )
+
+ # zenodo_type = obj['metadata']['resource_type']['type']
+
+ if obj["metadata"]["access_right"] != "open":
+            raise PlatformRestrictedError(
+                f"not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
+            )
+
+ manifest = []
+ for row in obj["files"]:
+ mf = FilesetManifestFile(
+ path=row["key"],
+ size=row["size"],
+ platform_url=row["links"]["self"],
+ # extra=dict(),
+ )
+ checksum = row["checksum"]
+ # eg: md5:35ffcab905f8224556dba76648cb7dad
+ if checksum.startswith("md5:"):
+ mf.md5 = checksum[4:]
+ elif checksum.startswith("sha1:"):
+                mf.sha1 = checksum[5:]
+ manifest.append(mf)
+
+ authors = []
+ for author in obj["metadata"]["creators"]:
+ authors.append(author["name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["metadata"]["title"],
+ date=obj["metadata"]["publication_date"],
+ source=obj["links"]["html"],
+ description=obj["metadata"]["description"],
+ license=obj["metadata"]["license"]["id"],
+ version=obj["revision"],
+ # obj['metadata']['version'] is, eg, git version tag
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=obj["revision"]),
+ )
+
+
+class ArchiveOrgHelper(FilesetPlatformHelper):
+
+ FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "archiveorg"
+ self.session = internetarchive.get_session()
+
+ @staticmethod
+ def want_item_file(f: internetarchive.File, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in [
+ "_academictorrents.torrent",
+ "_academictorrents_torrent.txt",
+ ".bib",
+ ]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+ patterns = [
+ "://archive.org/details/",
+ "://archive.org/download/",
+ ]
+ for p in patterns:
+ if p in url:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ base_url_split = request["base_url"].split("/")
+ # print(base_url_split, file=sys.stderr)
+ assert len(base_url_split) in [5, 6]
+ assert base_url_split[0] in ["http:", "https:"]
+ assert base_url_split[2] == "archive.org"
+ assert base_url_split[3] in ["details", "download"]
+ item_name = base_url_split[4]
+ if len(base_url_split) == 6 and base_url_split[5]:
+ raise PlatformScopeError(
+ "got an archive.org file path, not download/details page; individual files not handled yet"
+ )
+
+ # print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ item = self.session.get_item(item_name)
+ item_name = item.identifier
+ item_collection = item.metadata["collection"]
+ if type(item_collection) == list:
+ item_collection = item_collection[0]
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+ manifest = []
+ for f in item_files:
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = FilesetManifestFile(
+ path=f.name,
+ size=int(f.size),
+ sha1=f.sha1,
+ md5=f.md5,
+ mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+ platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+ )
+ manifest.append(mf)
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain="archive.org",
+ platform_id=item_name,
+ archiveorg_item_name=item_name,
+ archiveorg_meta=dict(collection=item_collection),
+ )
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ """
+ Don't use default strategy picker; we are always doing an 'existing' in this case.
+ """
+ assert item.manifest is not None
+ if len(item.manifest) == 1:
+ # NOTE: code flow does not support ArchiveorgFilesetBundle for the
+ # case of, eg, a single zipfile in an archive.org item
+ return IngestStrategy.ArchiveorgFile
+ elif len(item.manifest) >= 1:
+ return IngestStrategy.ArchiveorgFileset
+ else:
+ raise NotImplementedError("empty dataset")
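A hedged sketch of how callers typically dispatch across these helpers; the actual wiring lives in the fileset ingest worker, not in this module, and the list and function names here are hypothetical:

    from sandcrawler.fileset_platforms import (
        ArchiveOrgHelper,
        DataverseHelper,
        FigshareHelper,
        ZenodoHelper,
    )

    PLATFORM_HELPERS = [
        DataverseHelper(),
        FigshareHelper(),
        ZenodoHelper(),
        ArchiveOrgHelper(),
    ]

    def pick_platform_helper(request, resource, html_biblio):
        # first helper whose match_request() accepts the request wins
        for helper in PLATFORM_HELPERS:
            if helper.match_request(request, resource, html_biblio):
                return helper
        return None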
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..1d84ce5
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,387 @@
+import os
+import shutil
+import sys
+from typing import Optional
+
+import internetarchive
+import requests
+
+from sandcrawler.fileset_types import (
+ ArchiveStrategyResult,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformScopeError,
+)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
+from sandcrawler.misc import (
+ gen_file_metadata,
+ gen_file_metadata_path,
+ requests_retry_session,
+ sanitize_fs_path,
+)
+
+
+class FilesetIngestStrategy:
+ def __init__(self):
+ # self.ingest_strategy = 'unknown'
+ self.success_status = "success"
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ raise NotImplementedError()
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+
+ # TODO: enable cleanup when confident (eg, safe path parsing)
+ self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+ self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
+ try:
+ os.mkdir(self.working_dir)
+ except FileExistsError:
+ pass
+
+ self.http_session = requests_retry_session()
+ self.ia_session = internetarchive.get_session(
+ config={
+ "s3": {
+ "access": os.environ.get("IA_ACCESS_KEY"),
+ "secret": os.environ.get("IA_SECRET_KEY"),
+ },
+ }
+ )
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ """
+ use API to check for item with all the files in the manifest
+
+ NOTE: this naive comparison is quadratic in number of files, aka O(N^2)
+ """
+ ia_item = self.ia_session.get_item(item.archiveorg_item_name)
+ if not ia_item.exists:
+ return None
+ item_files = ia_item.get_files(on_the_fly=False)
+ assert item.manifest
+ for wanted in item.manifest:
+ found = False
+ for existing in item_files:
+ if existing.name == wanted.path:
+ if (
+ (
+ (existing.sha1 and existing.sha1 == wanted.sha1)
+ or (existing.md5 and existing.md5 == wanted.md5)
+ )
+ and existing.name == wanted.path
+ and existing.size == wanted.size
+ ):
+ found = True
+ wanted.status = "exists"
+ break
+ else:
+ wanted.status = "mismatch-existing"
+ break
+ if not found:
+ print(
+ f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
+ file=sys.stderr,
+ )
+ return None
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status="success-existing",
+ manifest=item.manifest,
+ )
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ May require extra context to pass along to archive.org item creation.
+ """
+ existing = self.check_existing(item)
+ if existing:
+ return existing
+
+ if item.platform_name == "archiveorg":
+ raise PlatformScopeError("shouldn't download archive.org into itself")
+
+ local_dir = self.working_dir + item.archiveorg_item_name
+ assert local_dir.startswith("/")
+ assert local_dir.count("/") > 2
+ try:
+ os.mkdir(local_dir)
+ except FileExistsError:
+ pass
+
+ # 1. download all files locally
+ assert item.manifest
+ for m in item.manifest:
+ if m.path != sanitize_fs_path(m.path):
+ m.status = "unsafe-path"
+ continue
+
+ local_path = local_dir + "/" + m.path
+ assert m.platform_url
+
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ if os.path.exists(local_path):
+ m.status = "exists-local"
+ else:
+ print(f" downloading {m.path}", file=sys.stderr)
+ # create any sub-directories for this path, if necessary
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ try:
+ with self.http_session.get(
+ m.platform_url,
+ stream=True,
+ allow_redirects=True,
+ timeout=2 * 60 * 60,
+ ) as r:
+ r.raise_for_status()
+ with open(local_path + ".partial", "wb") as f:
+ for chunk in r.iter_content(chunk_size=256 * 1024):
+ f.write(chunk)
+ os.rename(local_path + ".partial", local_path)
+ m.status = "downloaded-local"
+ except requests.exceptions.RequestException:
+ m.status = "error-platform-download"
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-platform-download",
+ )
+
+ print(f" verifying {m.path}", file=sys.stderr)
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
+
+ if m.sha1:
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
+ else:
+ m.sha1 = file_meta["sha1hex"]
+
+ if m.sha256:
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
+ else:
+ m.sha256 = file_meta["sha256hex"]
+
+ if m.md5:
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
+ else:
+ m.md5 = file_meta["md5hex"]
+
+ if m.mimetype:
+            # 'magic' isn't good at parsing more detailed text file formats like text/csv
+ if (
+ file_meta["mimetype"] != m.mimetype
+ and file_meta["mimetype"] != "text/plain"
+ ):
+ # these 'tab-separated-values' from dataverse are just noise, don't log them
+ if m.mimetype != "text/tab-separated-values":
+ print(
+ f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
+ file=sys.stderr,
+ )
+ m.mimetype = file_meta["mimetype"]
+ else:
+ m.mimetype = file_meta["mimetype"]
+ m.status = "verified-local"
+
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
+ # 2. upload all files, with metadata
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
+ item_files = {}
+ for m in item.manifest:
+ local_path = local_dir + "/" + m.path
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
+
+ print(
+ f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
+ file=sys.stderr,
+ )
+ try:
+ internetarchive.upload(
+ item.archiveorg_item_name,
+ files=item_files,
+ metadata=item.archiveorg_item_meta,
+ checksum=True,
+ queue_derive=False,
+ verify=True,
+ )
+ except requests.exceptions.RequestException:
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-archiveorg-upload",
+ )
+
+ for m in item.manifest:
+ m.status = "success"
+
+ # 4. delete local directory
+ if not self.skip_cleanup_local_files:
+ shutil.rmtree(local_dir)
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=self.success_status,
+ manifest=item.manifest,
+ )
+
+ return result
+
+
+class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
+ """
+ ArchiveorgFilesetStrategy currently works fine with individual files. Just
+ need to over-ride the ingest_strategy name.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+ self.success_status = "success-file"
+
+
+class WebFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.WebFileset
+ self.wayback_client = WaybackClient()
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+ self.max_spn_manifest = 20
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
+
+ TODO:
+ - full fetch_resource() method which can do SPN requests
+ """
+
+ assert item.manifest
+ file_file_meta = None
+ file_resource = None
+ for m in item.manifest:
+ fetch_url = m.platform_url
+ if not fetch_url:
+ raise NotImplementedError(
+ "require 'platform_url' for each file when doing Web fetching"
+ )
+
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
+
+ if self.try_spn2 and (
+ resource is None or (resource and resource.status == "no-capture")
+ ):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = "too-much-spn"
+ continue
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(
+ fetch_url, self.wayback_client, force_simple_get=True
+ )
+
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or fetch_url,
+ ),
+ file=sys.stderr,
+ )
+
+ m.terminal_url = resource.terminal_url
+ m.terminal_dt = resource.terminal_dt
+ m.status = resource.status
+ if self.ingest_strategy == "web-file":
+ file_resource = resource
+
+ if resource.status != "success":
+ continue
+ else:
+ assert resource.terminal_status_code == 200
+
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
+
+ if self.ingest_strategy == "web-file":
+ file_file_meta = file_meta
+
+ if (
+ file_meta["size_bytes"] != m.size
+ or (m.md5 and m.md5 != file_meta["md5hex"])
+ or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+ ):
+ m.status = "mismatch"
+ continue
+
+ m.md5 = m.md5 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
+ m.sha256 = m.sha256 or file_meta["sha256hex"]
+ m.mimetype = m.mimetype or file_meta["mimetype"]
+
+ overall_status = self.success_status
+ for m in item.manifest:
+ if m.status != "success":
+ overall_status = m.status or "not-processed"
+ break
+ if not item.manifest:
+ overall_status = "empty-manifest"
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=overall_status,
+ manifest=item.manifest,
+ )
+ if self.ingest_strategy == "web-file":
+ result.file_file_meta = file_file_meta
+ result.file_resource = file_resource
+ return result
+
+
+class WebFileStrategy(WebFilesetStrategy):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.ingest_strategy = IngestStrategy.WebFile
+ self.success_status = "success-file"
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..3398833
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,74 @@
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class IngestStrategy(str, Enum):
+ WebFile = "web-file"
+ WebFileset = "web-fileset"
+ WebFilesetBundled = "web-fileset-bundled"
+ ArchiveorgFile = "archiveorg-file"
+ ArchiveorgFileset = "archiveorg-fileset"
+ ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+
+class FilesetManifestFile(BaseModel):
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+
+class FilesetPlatformItem(BaseModel):
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta: Optional[dict]
+ web_base_url: Optional[str]
+ web_bundle_url: Optional[str]
+
+
+class ArchiveStrategyResult(BaseModel):
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[Dict[str, Any]]
+ file_resource: Optional[Any]
+ bundle_file_meta: Optional[Dict[str, Any]]
+ bundle_resource: Optional[Any]
+ bundle_archiveorg_path: Optional[str]
+
+
+class PlatformScopeError(Exception):
+ """
+    For incidents where a platform helper discovers that the fileset/dataset is
+    out-of-scope after already starting to process it.
+
+ For example, attempting to ingest:
+
+ - a 'latest version' record, when the platform has version-specific records
+ - a single file within a dataset for a platform which has file-level identifiers
+ """
+
+ pass
+
+
+class PlatformRestrictedError(Exception):
+ """
+ When datasets are not publicly available on a platform (yet)
+ """
+
+ pass
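A minimal usage sketch of the pydantic models above (all values invented); optional fields default to None, and results serialize cleanly to JSON for downstream sinks:

    from sandcrawler.fileset_types import (
        ArchiveStrategyResult,
        FilesetManifestFile,
        IngestStrategy,
    )

    mf = FilesetManifestFile(
        path="data/results.csv",
        size=4096,
        md5="0f343b0931126a20f133d67c2b018a3b",  # invented checksum
        mimetype="text/csv",
        status="verified-local",
    )
    result = ArchiveStrategyResult(
        ingest_strategy=IngestStrategy.WebFileset,
        status="success",
        manifest=[mf],
    )
    print(result.json(exclude_none=True))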
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 11623c5..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,93 +1,301 @@
+import html
+import sys
+import time
+import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
import requests
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
-from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
-from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
+from .ia import WaybackClient
+from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
+
+def clean_crossref_unstructured(raw: str) -> str:
+ """
+ Applies Crossref-specific cleanups to an 'unstructured' citation string.
+ """
+
+ # detect repeated strings with double space separating them
+    subs = raw.split("  ")
+ if len(subs) == 2 and subs[0] == subs[1]:
+ raw = subs[0]
+ else:
+ raw = " ".join(subs)
+
+    # unescape HTML/XML character references (eg, &#232; or &amp;)
+ if "&#" in raw or "&amp;" in raw or "&gt;" in raw or "&lt;" in raw:
+ raw = html.unescape(raw)
+
+    raw.replace("\u00a0", " ")
+ raw = raw.strip()
+ return raw
+
+
+def test_clean_ref_str() -> None:
+ # NOTE: this as emdash, non-breaking string characters in it
+ raw_with_nbsp = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394. Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ cleaned = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ assert clean_crossref_unstructured(raw_with_nbsp) == cleaned
+
+ # HTML escape characters
+ assert (
+ clean_crossref_unstructured(
+ "J-B Champion, C.Collin, INSEE Premi&#232;re N&#176;1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+ == "J-B Champion, C.Collin, INSEE Première N°1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+
+ # simple doubling
+ assert (
+ clean_crossref_unstructured("https://graph500.org/. https://graph500.org/.")
+ == "https://graph500.org/."
+ )
+ assert (
+ clean_crossref_unstructured(
+ """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg. Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+ == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+
+ # all non-breaking whitespace
+ assert (
+ clean_crossref_unstructured(
+ "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+ )
+ == ""
+ )
-class GrobidClient(object):
- def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+class GrobidClient(object):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
+ self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
+ self.session = requests_retry_session()
- def process_fulltext(self, blob, consolidate_mode=None):
+ def process_fulltext(
+ self, blob: bytes, consolidate_mode: Optional[int] = None
+ ) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
- status (slug)
- error_msg (if status == 'error')
- tei_xml (if status is 200)
-
- TODO: persist connection for performance?
"""
assert blob
- if consolidate_mode == None:
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
+ assert consolidate_mode is not None
try:
- grobid_response = requests.post(
+ grobid_response = self.session.post(
self.host_url + "/api/processFulltextDocument",
files={
- 'input': blob,
- 'consolidateHeader': self.consolidate_mode,
- 'consolidateCitations': 0, # too expensive for now
- 'includeRawCitations': 1,
+ "input": blob,
+ },
+ data={
+ "consolidateHeader": consolidate_mode,
+ "consolidateCitations": 0, # too expensive for now
+ "includeRawCitations": 1,
+ "includeRawAffiliations": 1,
+ "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+ "segmentSentences": 1,
},
timeout=180.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'GROBID request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally raising this, so workers crash when GROBID
+ # unavailable. but do add a sleep to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
- info = dict(
- status_code=grobid_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
- info['status'] = 'success'
- info['tei_xml'] = grobid_response.text
- if len(info['tei_xml']) > 12000000:
+ info["status"] = "success"
+ info["tei_xml"] = grobid_response.text
+ if len(info["tei_xml"]) > 12000000:
# XML is larger than Kafka message size, and much larger than
# an article in general; bail out
- info['status'] = 'error'
- info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
- info.pop('tei_xml')
+ info["status"] = "error"
+ info["error_msg"] = "response XML too large: {} bytes".format(
+ len(info["tei_xml"])
+ )
+ info.pop("tei_xml")
else:
# response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
+ info["status"] = "error"
+ info["error_msg"] = grobid_response.text[:10000]
return info
- def metadata(self, result):
- if result['status'] != 'success':
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
+
+ try:
+ grobid_response = self.session.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
+
+ def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ if result["status"] != "success":
return None
- tei_json = teixml2json(result['tei_xml'], encumbered=False)
+ try:
+ tei_doc = parse_document_xml(result["tei_xml"])
+ except xml.etree.ElementTree.ParseError as pe:
+ result["status"] = "bad-grobid-xml"
+ return dict(error_msg=str(pe)[:1000])
+ tei_doc.remove_encumbered()
+ tei_json = tei_doc.to_legacy_dict()
meta = dict()
biblio = dict()
- for k in ('title', 'authors', 'journal', 'date', 'doi', ):
+ for k in (
+ "title",
+ "authors",
+ "journal",
+ "date",
+ "doi",
+ ):
if tei_json.get(k):
biblio[k] = tei_json[k]
- meta['biblio'] = biblio
- for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+ meta["biblio"] = biblio
+ for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
if tei_json.get(k):
meta[k] = tei_json[k]
return meta
-class GrobidWorker(SandcrawlerFetchWorker):
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+ Helper function to decide whether to run GROBID parsing on an crossref
+ reference.
- def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
+ For example, if there is already a DOI in the ref metadata, could skip.
+ Or, if there is sufficient structured metadata, or only depending on
+ the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "").strip()) <= 6:
+ return False
+
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
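A couple of hedged examples of the skip rules above; the ref dicts are invented, in the Crossref reference schema, and constructing GrobidClient here makes no network requests:

    from sandcrawler.grobid import GrobidClient

    client = GrobidClient()

    # a ref that already carries a DOI is skipped
    assert not client.should_parse_crossref_ref(
        {"DOI": "10.1000/example", "unstructured": "Some citation string here"}
    )

    # sufficiently structured metadata (year + author + journal title) is skipped
    assert not client.should_parse_crossref_ref(
        {
            "year": "2018",
            "author": "Smith",
            "journal-title": "Example Journal",
            "unstructured": "Smith. An example paper. Example Journal, 2018.",
        }
    )

    # only a long unstructured string: worth sending to GROBID
    assert client.should_parse_crossref_ref(
        {"unstructured": "Smith J. An example paper. Example Journal, 2018."}
    )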
+
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+        Given a complete Crossref metadata record, inspects the reference list
+        and runs GROBID parsing on any unstructured references that look worth
+        processing (see should_parse_crossref_ref()).
+
+ The returned dict is in the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+ refs = self.process_citation_list(clean_refs)
+
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ refs[i].index = None
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
+
+class GrobidWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ grobid_client: GrobidClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def timeout_response(self, task):
- default_key = task['sha1hex']
+ def timeout_response(self, task: Any) -> Any:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal GROBID worker timeout",
@@ -95,37 +303,74 @@ class GrobidWorker(SandcrawlerFetchWorker):
key=default_key,
)
- def process(self, record, key=None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['source'] = record
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["source"] = record
+ result["key"] = result["file_meta"]["sha1hex"]
return result
+
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ # handle the rare case of bad TEI-XML response
+ # eg: https://github.com/kermitt2/grobid/issues/848
+ try:
+ return self.grobid_client.crossref_refs(record)
+ except xml.etree.ElementTree.ParseError:
+ print(
+ f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ file=sys.stderr,
+ )
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
+ except requests.exceptions.HTTPError:
+ print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+ except requests.exceptions.ReadTimeout:
+ print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, grobid_client, sink=None, **kwargs):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
return result
-
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 88ea41b..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,36 +1,20 @@
-
+import json
import re
import sys
-import json
import urllib.parse
+from typing import Any, Dict
from bs4 import BeautifulSoup
-RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"')
+RESEARCHSQUARE_REGEX = re.compile(
+ r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"'
+)
IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
-def test_regex():
- lines = """
- blah
- var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
- asdf"""
- m = OVID_JOURNAL_URL_REGEX.search(lines)
- assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
-
- lines = """
- window.onload = function () {
- window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client';
- refreshOriginalWindow();
- }
- """
- url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
- m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
- assert m.group(1) == url
-
-def extract_fulltext_url(html_url, html_body):
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
@@ -38,9 +22,9 @@ def extract_fulltext_url(html_url, html_body):
On error, or if fails to extract a URL, returns an empty dict.
"""
- host_prefix = '/'.join(html_url.split('/')[:3])
+ host_prefix = "/".join(html_url.split("/")[:3])
try:
- soup = BeautifulSoup(html_body, 'html.parser')
+ soup = BeautifulSoup(html_body, "html.parser")
except TypeError as te:
print(f"{te} (url={html_url})", file=sys.stderr)
return dict()
@@ -48,97 +32,75 @@ def extract_fulltext_url(html_url, html_body):
print(f"{ule} (url={html_url})", file=sys.stderr)
return dict()
+ # ignoring most type checks on bs4 output in this function (which is partially deprecated)
+ meta: Any
+ url: Any
+ redirect: Any
+
### General Tricks ###
+ # note: most of these have migrated to the html_biblio code path
- # highwire-style meta tag
- meta = soup.find('meta', attrs={"name":"citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
- if not meta:
- # researchgate does this; maybe others also?
- meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
- # if tag is only partially populated
- if meta and not meta.get('content'):
- meta = None
- # wiley has a weird almost-blank page we don't want to loop on
- if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
- url = meta['content'].strip()
- if url.startswith('/'):
- return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
- elif url.startswith('http'):
- return dict(pdf_url=url, technique='citation_pdf_url')
- else:
- print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)
-
- # sage, and also utpjournals (see below)
- # https://journals.sagepub.com/doi/10.1177/2309499019888836
- # <a href="http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836" class="show-pdf" target="_self">
- # <a href="http://utpjournals.press/doi/pdf/10.3138/cjh.ach.54.1-2.05" class="show-pdf" target="_blank">
- href = soup.find('a', attrs={"class":"show-pdf"})
- if href:
- url = href['href'].strip()
- if url.startswith('http'):
- return dict(pdf_url=url, technique='href_show-pdf')
-
- # ACS (and probably others) like:
- # https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
- # <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
- href = soup.find('a', attrs={"title":"PDF"})
- if href and 'href' in href:
- url = href['href'].strip()
- if url.startswith('http'):
- return dict(pdf_url=url, technique='href_title')
- elif url.startswith('/'):
- return dict(pdf_url=host_prefix+url, technique='href_title')
-
- # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
- # <embed src="/files/jass_makaleler/1359848334_33-Okt.%20Yasemin%20KARADEM%C4%B0R.pdf" type="application/pdf" />
- embed = soup.find('embed', attrs={"type": "application/pdf"})
- if embed:
- url = embed['src'].strip()
- if url.startswith('/'):
- url = host_prefix+url
- if url.startswith('http'):
- return dict(pdf_url=url, technique='embed_type')
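+    # capture the <meta name="generator"> value; used further down to detect
+    # OJS (Open Journal Systems) sites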
+ meta = soup.find("meta", attrs={"name": "generator"})
+ meta_generator = None
+ if meta and meta.get("content"):
+ meta_generator = meta["content"].strip()
### Publisher/Platform Specific ###
- # eLife (elifesciences.org)
- if '://elifesciences.org/articles/' in html_url:
- anchor = soup.find("a", attrs={"data-download-type": "pdf-article"})
- if anchor:
- url = anchor['href'].strip()
- assert '.pdf' in url
- return dict(pdf_url=url, technique='publisher')
-
# research square (researchsquare.com)
- if 'researchsquare.com/article/' in html_url:
+ if "researchsquare.com/article/" in html_url:
# JSON in body with a field like:
# "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
- m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8'))
+ m = RESEARCHSQUARE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="manuscript", pdf_url=url, technique='publisher')
+ return dict(release_stage="manuscript", pdf_url=url, technique="publisher")
    # elsevier linking hub
# https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
- if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
+ if "://linkinghub.elsevier.com/retrieve/pii/" in html_url:
# <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/>
redirect = soup.find("input", attrs={"name": "redirectURL"})
if redirect:
- url = redirect['value'].strip()
- if 'http' in url:
+ url = redirect["value"].strip()
+ if "http" in url:
url = urllib.parse.unquote(url)
                # drop the "?via=..." tracking query parameter
- url = url.split('?via')[0]
+ url = url.split("?via")[0]
return dict(next_url=url, technique="elsevier-linkinghub")
+ # sciencedirect PDF URL extract
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+ if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
+ json_tag: Any = soup.find(
+ "script", attrs={"type": "application/json", "data-iso-key": "_0"}
+ )
+ url = None
+ if json_tag:
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"]
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = (
+ html_url
+ + pdf_meta["pdfExtension"]
+ + "?md5="
+ + pdf_meta["queryParams"]["md5"]
+ + "&pid="
+ + pdf_meta["queryParams"]["pid"]
+ )
+ except (KeyError, TypeError, json.JSONDecodeError):
+ pass
+ if url:
+ return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
# sciencedirect PDF bounce page
# https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
- if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
+ if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"):
# window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=[...]&hash=[...]&host=[...]&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=[...]&type=client';
- m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode('utf-8'))
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4000
@@ -146,115 +108,108 @@ def extract_fulltext_url(html_url, html_body):
# ieeexplore.ieee.org
# https://ieeexplore.ieee.org/document/8730316
- if '://ieeexplore.ieee.org/document/' in html_url:
+ if "://ieeexplore.ieee.org/document/" in html_url:
# JSON in body with a field like:
# "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
- m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8'))
+ m = IEEEXPLORE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="published", pdf_url=host_prefix+url, technique="ieeexplore")
+ return dict(
+ release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore"
+ )
# https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
- if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
+ if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url:
# HTML iframe like:
# <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
- iframe = soup.find("iframe")
- if iframe and '.pdf' in iframe['src']:
- return dict(pdf_url=iframe['src'], technique="iframe")
-
- # utpjournals.press
- # https://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
- if '://utpjournals.press/doi/10.' in html_url:
- # <a href="http://utpjournals.press/doi/pdf/10.3138/cjh.ach.54.1-2.05" class="show-pdf" target="_blank">
- href = soup.find('a', attrs={"class":"show-pdf"})
- if href:
- url = href['href'].strip()
- if url.startswith('http'):
- return dict(pdf_url=url, technique='publisher-href')
-
- # https://www.jcancer.org/v10p4038.htm
- # simple journal-specific href
- if '://www.jcancer.org/' in html_url and html_url.endswith(".htm"):
- # <a href='v10p4038.pdf' class='textbutton'>PDF</a>
- href = soup.find('a', attrs={"class":"textbutton"})
- if href:
- url = href['href'].strip()
- if url.endswith(".pdf") and not "http" in url:
- return dict(pdf_url=host_prefix+"/"+url, technique='journal-href')
+ iframe: Any = soup.find("iframe")
+ if iframe and ".pdf" in iframe["src"]:
+ return dict(pdf_url=iframe["src"], technique="iframe")
# https://insights.ovid.com/crossref?an=00042307-202001000-00013
# Ovid is some kind of landing page bounce portal tracking run-around.
# Can extract actual journal URL from javascript blob in the HTML
- if '://insights.ovid.com/crossref' in html_url:
+ if "://insights.ovid.com/crossref" in html_url:
# var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
- m = OVID_JOURNAL_URL_REGEX.search(html_body.decode('utf-8'))
+ m = OVID_JOURNAL_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(next_url=url, technique='ovid')
+ return dict(next_url=url, technique="ovid")
# osf.io
# https://osf.io/8phvx/
# https://osf.io/preprints/socarxiv/8phvx/
# wow, they ship total javascript crud! going to just guess download URL
# based on URL for now. Maybe content type header would help?
- if '://osf.io/' in html_url and not '/download' in html_url:
- if not html_url.endswith("/"):
- next_url = html_url+"/download"
- else:
- next_url = html_url+"download"
- return dict(next_url=next_url, technique='osf-by-url')
+ OSF_DOMAINS = [
+ "://osf.io/",
+ "://biohackrxiv.org/",
+ "://psyarxiv.com/",
+ "://arabixiv.org/",
+ "://engrxiv.org/",
+ "://edarxiv.org//",
+ "://ecsarxiv.org/",
+ "://ecoevorxiv.org/",
+ "://frenxiv.org/",
+ "://indiarxiv.org/",
+ "://mindrxiv.org/",
+ "://mediarxiv.org/",
+ "://paleorxiv.org/",
+ "://thesiscommons.org/",
+ ]
+ for domain in OSF_DOMAINS:
+ if (
+ domain in html_url
+ and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url)
+ and "/download" not in html_url
+ ):
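+            # matches short paths like https://osf.io/8phvx/ (4-5 segments when
+            # split on "/") or explicit /preprints/ pages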
+ if not html_url.endswith("/"):
+ next_url = html_url + "/download"
+ else:
+ next_url = html_url + "download"
+ return dict(next_url=next_url, technique="osf-by-url")
# wiley
# https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
if b"/doi/pdfdirect/" in html_body:
- next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
- return dict(next_url=next_url, technique='wiley-pdfdirect')
-
- # taylor and frances
- # https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234
- # <a href="/doi/pdf/10.1080/19491247.2019.1682234?needAccess=true" class="show-pdf" target="_blank">
- if "://www.tandfonline.com/doi/full/10." in html_url:
- href = soup.find('a', attrs={"class":"show-pdf"})
- if href:
- url = href['href'].strip()
- if "/pdf/" in url:
- return dict(pdf_url=host_prefix+url, technique='publisher-href')
+ next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/")
+ return dict(next_url=next_url, technique="wiley-pdfdirect")
# arxiv abstract pages
if "://arxiv.org/abs/" in html_url:
url = html_url.replace("/abs/", "/pdf/")
- return dict(pdf_url=url, technique='arxiv-url')
+ return dict(pdf_url=url, technique="arxiv-url")
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
- if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+ if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
# use a more aggressive direct guess to avoid rate-limiting...
if "/doi/10." in html_url:
url = html_url.replace("/doi/10.", "/doi/pdf/10.")
- return dict(pdf_url=url, technique='archivist-url')
+ return dict(pdf_url=url, technique="archivist-url")
# <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
- hrefs = soup.find_all('a', attrs={"target":"_blank"})
+ hrefs = soup.find_all("a", attrs={"target": "_blank"})
for href in hrefs:
- url = href['href'].strip()
+ url = href["href"].strip()
if "/doi/pdf/" in url:
- if url.startswith('http'):
- return dict(pdf_url=url, technique='publisher-href')
- elif url.startswith('/'):
- return dict(pdf_url=host_prefix+url, technique='publisher-href')
+ if url.startswith("http"):
+ return dict(pdf_url=url, technique="publisher-href")
+ elif url.startswith("/"):
+ return dict(pdf_url=host_prefix + url, technique="publisher-href")
# protocols.io
# https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='protocolsio-url')
+ return dict(pdf_url=url, technique="protocolsio-url")
# degruyter.com
# https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
- url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf')
- return dict(pdf_url=url, technique='degruyter-url')
+ url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf")
+ return dict(pdf_url=url, technique="degruyter-url")
# journals.lww.com (Wolters Kluwer)
# https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
@@ -262,80 +217,149 @@ def extract_fulltext_url(html_url, html_body):
# we never get the content.
if "://journals.lww.com/" in html_url and False:
# data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
- for line in html_body.split(b'\n'):
+ for line in html_body.split(b"\n"):
if b"data-pdf-url=" in line:
- line = line.decode('utf-8')
- url = line.strip().replace('data-pdf-url=', '').replace('"', '')
- if url.startswith('http') and 'pdfs.journals.lww.com' in url:
- return dict(pdf_url=url, technique='journals.lww.com-jsvar')
+ line = line.decode("utf-8")
+ url = line.strip().replace("data-pdf-url=", "").replace('"', "")
+ if url.startswith("http") and "pdfs.journals.lww.com" in url:
+ return dict(pdf_url=url, technique="journals.lww.com-jsvar")
# www.ahajournals.org
# https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
- if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+ if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url:
# <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/10.', '/doi/pdf/10.')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
url = url + "?download=true"
- return dict(pdf_url=url, technique='ahajournals-url')
+ return dict(pdf_url=url, technique="ahajournals-url")
# ehp.niehs.nih.gov
# https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
# https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
if "://ehp.niehs.nih.gov/doi/" in html_url:
# <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
- return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
-
- # journals.tsu.ru (and maybe others)
- # http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405
-    # <a class='file pdf' href='http://journals.tsu.ru/engine/download.php?id=150921&area=files'>Скачать электронную версию публикации</a>
- href = soup.find('a', attrs={"class":"file pdf"})
- if href:
- url = href['href'].strip()
- if url.startswith('http'):
- return dict(pdf_url=url, technique='href_file_pdf-pdf')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace(
+ "/doi/10.", "/doi/pdf/10."
+ )
+ return dict(pdf_url=url, technique="ehp.niehs.nigh.gov-url")
# cogentoa.com
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
- if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+ if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
# blech, it's a SPA! All JS
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='cogentoa-url')
+ return dict(pdf_url=url, technique="cogentoa-url")
# chemrxiv.org (likely to be other figshare domains also)
# https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
- if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url:
+ if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url:
# <script id="app-data" type="text/json"> [...] </script>
- json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"})
+ json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"})
if json_tag and json_tag.string:
app_data = json.loads(json_tag.string)
# "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
- url = app_data.get('article', {}).get('exportPdfDownloadUrl')
- if url and url.startswith('http'):
- return dict(pdf_url=url, technique='figshare-json')
-
- # eurosurveillance
- # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230
- if "://www.eurosurveillance.org/content/" in html_url:
- # <a href="/deliver/fulltext/eurosurveillance/25/11/eurosurv-25-11-3.pdf?itemId=/content/10.2807/1560-7917.ES.2020.25.11.2000230&mimeType=pdf&containerItemId=content/eurosurveillance" class="pdf " title="Download" rel="http://instance.metastore.ingenta.com/content/10.2807/1560-7917.ES.2020.25.11.2000230" target="/content/10.2807/1560-7917.ES.2020.25.11.2000230-pdf" >
- href = soup.find('a', attrs={"class":"pdf", "title": "Download"})
- if href:
- url = href['href'].strip()
- if not url.startswith('http'):
- url = host_prefix + url
- return dict(pdf_url=url, technique='eurosurveillance-href')
+ url = app_data.get("article", {}).get("exportPdfDownloadUrl")
+ if url and url.startswith("http"):
+ return dict(pdf_url=url, technique="figshare-json")
# CNKI COVID-19 landing pages
# http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
- if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
+ if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url:
# <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
- href = soup.find('a', attrs={"id":"pdfDown"})
+ href = soup.find("a", attrs={"id": "pdfDown"})
if href:
- url = href['href'].strip().replace('&#xA;', '')
- if not url.startswith('http'):
+ url = href["href"].strip().replace("&#xA;", "")
+ if not url.startswith("http"):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique="cnki-href")
+
+ # RWTH AACHEN repository
+ if "://publications.rwth-aachen.de/record/" in html_url:
+ record_id = html_url.split("/")[-1]
+ url = f"{html_url}/files/{record_id}.pdf"
+ if record_id.isdigit() and url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="rwth-aachen-url")
+
+ # physchemaspects.ru
+ if "://physchemaspects.ru/" in html_url and soup:
+ for href in soup.find_all("a"):
+ if href.text == "download PDF file":
+ url = href["href"]
+ if url.startswith("/"):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique="physchemaspects-href")
+
+ # OJS 3 (some)
+ if meta_generator and meta_generator.startswith("Open Journal Systems"):
+ href = soup.find("a", attrs={"class": "obj_galley_link file"})
+ if href and href.text and "pdf" in href.text.lower():
+ url = href["href"].strip()
+ if url.startswith("/"):
url = host_prefix + url
- return dict(pdf_url=url, technique='cnki-href')
+ return dict(pdf_url=url, technique="ojs-galley-href")
+
+ # ETH zurich e-periodica
+ if "://www.e-periodica.ch/digbib/view" in html_url:
+ url = html_url.replace("digbib/view", "cntmng").split("#")[0]
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="href-eperiodica")
+
+ # JMIR
+ # https://mhealth.jmir.org/2020/7/e17891/
+ if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"):
+ url = html_url + "pdf"
+ return dict(pdf_url=url, technique="jmir-url")
+
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
+ ### below here we are doing guesses
+
+ # generic guess: try current URL plus .pdf, if it exists in the HTML body
+ if ".pdf" not in html_url:
+ url = html_url + ".pdf"
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="guess-url-plus-pdf")
return dict()
+
+
+def test_regex() -> None:
+ lines = """
+ blah
+ var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
+ asdf"""
+ m = OVID_JOURNAL_URL_REGEX.search(lines)
+ assert m
+ assert (
+ m.group(1)
+ == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ )
+
+ lines = """
+ window.onload = function () {
+ window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client';
+ refreshOriginalWindow();
+ }
+ """
+ url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+ assert m
+ assert m.group(1) == url
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
new file mode 100644
index 0000000..1e2d197
--- /dev/null
+++ b/python/sandcrawler/html_metadata.py
@@ -0,0 +1,1077 @@
+import datetime
+import sys
+import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+
+import braveblock
+import dateparser
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.misc import url_fuzzy_equal
+
+# this is a map of metadata keys to CSS selectors
+# sources for this list include:
+# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
+# - inspection of actual publisher HTML
+# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
+# The order of these is mostly by preference/quality (best option first),
+# though some are re-ordered for lookup efficiency (lookup stops after the
+# first match).
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
+ "title": [
+ "meta[name='citation_title']",
+ "meta[name='eprints.title']",
+ "meta[name='prism.title']",
+ "meta[name='bepress_citation_title']",
+ "meta[name='og:title']",
+ "meta[name='dcterms.title']",
+ "meta[name='dc.title']",
+ ],
+ "subtitle": [
+ "meta[name='prism.subtitle']",
+ ],
+ "doi": [
+ "meta[name='citation_doi']",
+ "meta[name='DOI']",
+ "meta[id='DOI']",
+ "meta[name='prism.doi']",
+ "meta[name='bepress_citation_doi']",
+ "meta[name='dc.identifier.doi']",
+ "meta[name='dc.identifier'][scheme='doi']",
+ ],
+ "pmid": [
+ "meta[name='citation_pmid']",
+ ],
+ "abstract": [
+ "meta[name='citation_abstract']",
+ "meta[name='bepress_citation_abstract']",
+ "meta[name='eprints.abstract']",
+ "meta[name='dcterms.abstract']",
+ "meta[name='prism.teaser']",
+ "meta[name='dc.description']",
+ "meta[name='og:description']",
+ ],
+ "container_name": [
+ "meta[name='citation_journal_title']",
+ "meta[name='bepress_citation_journal_title']",
+ "meta[name='citation_conference_title']",
+ "meta[name='bepress_citation_conference_title']",
+ "meta[name='prism.publicationName']",
+ "meta[name='eprints.publication']",
+ "meta[name='dc.relation.ispartof']",
+ "meta[name='dc.source']",
+ "meta[property='og:site_name']",
+ ],
+ "container_abbrev": [
+ "meta[name='citation_journal_abbrev']",
+ ],
+ "raw_date": [
+ "meta[name='citation_publication_date']",
+ "meta[name='bepress_citation_publication_date']",
+ "meta[name='prism.publicationDate']",
+ "meta[name='citation_date']",
+ "meta[name='bepress_citation_date']",
+ "meta[name='citation_online_date']",
+ "meta[name='bepress_citation_online_date']",
+ "meta[itemprop='datePublished']",
+ "meta[name='article:published']",
+ "meta[name='eprints.datestamp']",
+ "meta[name='eprints.date']",
+ "meta[name='dc.date.created']",
+ "meta[name='dc.issued']",
+ "meta[name='dcterms.date']",
+ "meta[name='dc.date']",
+ ],
+ "release_year": [
+ "meta[itemprop='citation_year']",
+ "meta[itemprop='prism:copyrightYear']",
+ ],
+ "first_page": [
+ "meta[name='citation_firstpage']",
+ "meta[name='bepress_citation_firstpage']",
+ "meta[name='prism.startingPage']",
+ "meta[name='dc.citation.spage']",
+ ],
+ "last_page": [
+ "meta[name='citation_lastpage']",
+ "meta[name='bepress_citation_lastpage']",
+ "meta[name='prism.endingPage']",
+ "meta[name='dc.citation.epage']",
+ ],
+ "issue": [
+ "meta[name='citation_issue']",
+ "meta[name='bepress_citation_issue']",
+ "meta[name='prism.issueIdentifier']",
+ "meta[name='dc.citation.issue']",
+ ],
+ "volume": [
+ "meta[name='citation_volume']",
+ "meta[name='bepress_citation_volume']",
+ "meta[name='prism.volume']",
+ "meta[name='dc.citation.volume']",
+ ],
+ "number": [
+ "meta[name='citation_technical_report_number']",
+ "meta[name='bepress_citation_technical_report_number']",
+ "meta[name='citation_number']",
+ "meta[name='bepress_citation_number']",
+ "meta[name='prism.number']",
+ ],
+ "container_issn": [
+ "meta[name='citation_issn']",
+ "meta[name='bepress_citation_issn']",
+ "meta[name='prism.issn']",
+ "meta[name='prism.eIssn']",
+ "meta[name='eprints.issn']",
+ "meta[name='dc.source.issn']",
+ ],
+ "isbn": [
+ "meta[name='citation_isbn']",
+ "meta[name='bepress_citation_isbn']",
+ "meta[name='prism.isbn']",
+ ],
+ "publisher": [
+ "meta[name='citation_publisher']",
+ "meta[name='bepress_citation_publisher']",
+ "meta[name='eprints.publisher']",
+ "meta[name='citation_technical_report_institution']",
+ "meta[name='dcterms.publisher']",
+ "meta[name='dc.publisher']",
+ ],
+ "raw_release_type": [
+ "meta[name='citation_article_type']",
+ "meta[name='bepress_citation_article_type']",
+ "meta[name='prism.contentType']",
+ "meta[name='eprints.type']",
+ "meta[name='dc.type']",
+ ],
+ "lang": [
+ "meta[name='citation_language']",
+ "meta[name='bepress_citation_language']",
+ "meta[name='dcterms.language']",
+ "meta[name='dc.language']",
+ "meta[name='og:locale']",
+ ],
+}
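+# For example, the "title" selectors above match a tag like
+# <meta name="citation_title" content="Some Article Title">; html_extract_biblio()
+# below reads the value out of the matched tag's "content" attribute.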
+
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
+ "contrib_names": [
+ "meta[name='citation_author']",
+ "meta[name='bepress_citation_author']",
+ "meta[name='eprints.creators_name']",
+ "meta[name='dcterms.creator']",
+ "meta[name='article:author']",
+ "meta[name='dc.creator']",
+ "meta[name='dc.contributor']",
+ ],
+ # TODO: citation_author_institution
+ "raw_references": [
+ "meta[name='citation_reference']",
+ ],
+ "raw_identifiers": [
+ "meta[name='eprints.id_number']",
+ "meta[name='dcterms.identifier']",
+ "meta[name='dc.identifier']",
+ ],
+}
+
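+# Each fulltext URL pattern below is a dict interpreted by
+# html_extract_fulltext_url() (defined later in this file):
+#   selector:        CSS selector for the candidate element
+#   attr:            element attribute holding the URL
+#   use_body:        (optional) take the element text instead of an attribute
+#   technique:       label recorded alongside the extracted URL
+#   in_doc_url:      (optional) substring the landing page URL must contain
+#   in_fulltext_url: (optional) substring the extracted URL must contain
+#   example_page:    (optional, documentation only) page the pattern was found on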
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "meta[name='citation_xml_url']",
+ "attr": "content",
+ "technique": "citation_xml_url",
+ },
+ {
+ "selector": "meta[name='fulltext_xml']",
+ "attr": "content",
+ "technique": "fulltext_xml",
+ },
+ {
+ "selector": "link[rel='alternate'][type='application/xml']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "selector": "link[rel='alternate'][type='text/xml']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "in_doc_url": "scielo",
+ "in_fulltext_url": "articleXML",
+ "selector": "a[target='xml']",
+ "attr": "href",
+ "technique": "SciElo XML link",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "viewXML",
+ "selector": "a[class='obj_galley_link']",
+ "attr": "href",
+ "technique": "OJS Gallery XML link",
+ },
+ {
+ "in_fulltext_url": "/download/xml/",
+ "selector": "a[title='XML']",
+ "attr": "href",
+ "technique": "ARPHA XML link",
+ "example_page": "https://zookeys.pensoft.net/article/26391",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "xml",
+ "selector": "a.download-files-nlm",
+ "attr": "href",
+ "technique": "XML (NLM) download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+]
+
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "meta[name='citation_fulltext_html_url']",
+ "attr": "content",
+ "technique": "citation_fulltext_html_url",
+ },
+ {
+ "selector": "link[rel='alternate'][type='text/html']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "inline=1",
+ "selector": "iframe[name='htmlFrame']",
+ "attr": "src",
+ "technique": "OJS HTML iframe",
+ },
+ {
+ "in_doc_url": "dovepress.com",
+ "in_fulltext_url": "-fulltext-",
+ "selector": "a[id='view-full-text']",
+ "attr": "href",
+ "technique": "dovepress fulltext link",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+]
+
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "in_doc_url": "pensoft.net/article/", # also /element/
+ "in_fulltext_url": "/download/fig/",
+ "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+ "attr": "href",
+ "technique": "Active figure download link (zookeys)",
+ "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+ },
+ {
+ "in_doc_url": "/file.xhtml?persistentId",
+ "in_fulltext_url": "/access/datafile/",
+ "selector": "div.form-group code",
+ "use_body": "true",
+ "technique": "Dataverse 'download URL'",
+ "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+ },
+]
+
+# This is a database of matching patterns. Most of these were discovered by
+# hand, looking at OA journal content that failed to crawl/ingest.
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "head meta[name='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "selector": "head meta[name='bepress_citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "in_doc_url": "journals.lww.com",
+ "selector": "head meta[name='wkhealth_pdf_url']",
+ "attr": "content",
+ "technique": "wkhealth_pdf_url",
+ "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
+ },
+ {
+ "selector": "head meta[property='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ # eg, researchgate
+ },
+ {
+ "selector": "head meta[name='eprints.document_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url (property)",
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE/UTP show-pdflink",
+ "example_page": "https://journals.sagepub.com/doi/10.1177/2309499019888836",
+ # also http://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a[title='PDF']",
+ "attr": "href",
+ "technique": "title=PDF link",
+ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
+ },
+ {
+ "in_doc_url": "/view/",
+ "selector": "a#pdfDownloadLink",
+ "attr": "href",
+ "technique": "OJS pdfDownloadLink link",
+ "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
+ },
+ {
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE PDF link",
+ "example_page": "http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836",
+ },
+ {
+ "in_doc_url": "://elifesciences.org/articles/",
+ "in_fulltext_url": "/download/",
+ "selector": "a[data-download-type='pdf-article']",
+ "attr": "href",
+ "technique": "eLife PDF link",
+ "example_page": "https://elifesciences.org/articles/59841",
+ },
+ {
+ "in_doc_url": "://www.jcancer.org/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".divboxright a.text-button",
+ "attr": "href",
+ "technique": "jcancer PDF link",
+ "example_page": "https://www.jcancer.org/v10p4038.htm",
+ },
+ {
+ "in_doc_url": "://www.tandfonline.com/doi/full/10.",
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "t+f show-pdf link",
+ "example_page": "https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234",
+ },
+ {
+ "in_doc_url": "article_id=",
+ "in_fulltext_url": "download.php",
+ "selector": "a.file.pdf",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405",
+ },
+ {
+ "in_doc_url": "/content/10.",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[title='Download']",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230",
+ },
+ {
+ "selector": "embed[type='application/pdf']",
+ "attr": "src",
+ "technique": "PDF embed",
+ "example_page": "http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401",
+ },
+ {
+ "in_doc_url": "/html/",
+ "in_fulltext_url": "create_pdf",
+ "selector": ".AbsPdfFigTab img[src='images/pdf-icon.jpg'] + a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.aed.org.cn/nyzyyhjxb/html/2018/4/20180408.htm",
+ },
+ {
+ "in_doc_url": "/archive-detail/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".contact-list a.download-pdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
+ },
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
+ {
+ "in_doc_url": "library.wur.nl/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.wl_full_text_restricted",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+ },
+ {
+ "in_doc_url": "/dlibra/",
+ "in_fulltext_url": "pdf",
+ "selector": "iframe#js-main-frame",
+ "attr": "src",
+ "technique": "PDF iframe (dlibra)",
+ "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+ },
+ {
+ "in_doc_url": "/handle/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.misc table.inner tr.b a",
+ "attr": "href",
+ "technique": "PDF URL link (DSpace, first file)",
+ "example_page": "https://orbi.uliege.be/handle/2268/174200",
+ },
+ {
+ "in_doc_url": "/publications/",
+ "in_fulltext_url": "pdf",
+ "selector": ".publication-sidebar li.open-access a.document-link",
+ "attr": "href",
+ "technique": "PDF URL link (Pure repo, OA link)",
+ "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+ },
+ {
+ "in_doc_url": "//hal",
+ "selector": ".widget-openaccess .widget-content a",
+ "attr": "href",
+ "technique": "Fulltext OA URL (HAL)",
+ "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+ },
+ {
+ "in_doc_url": "/record/",
+ "in_fulltext_url": "pdf",
+ "selector": "#detailedrecordminipanelfile a",
+ "attr": "href",
+ "technique": "PDF URL link (Invenio)",
+ "example_page": "https://bib-pubdb1.desy.de/record/416556",
+ },
+ {
+ "in_doc_url": "/available/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.file-table a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+ },
+ {
+ "in_doc_url": "/islandora/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.islandora-pdf-link",
+ "attr": "href",
+ "technique": "PDF URL link (Islandora)",
+ "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+ },
+ {
+ "in_doc_url": "/receive/",
+ "in_fulltext_url": "pdf",
+ "selector": ".mir-preview noscript a",
+ "attr": "href",
+ "technique": "PDF iframe via noscript (MyCoRe)",
+ "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+ },
+ {
+ "in_doc_url": "/registro.do",
+ "in_fulltext_url": "imagenes",
+ "selector": ".resumen_bib a[data-analytics=media]",
+ "attr": "href",
+ "technique": "Media link (DIGIBIS)",
+ "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+ },
+ {
+ "in_doc_url": "/view",
+ "in_fulltext_url": "/at_download/",
+ "selector": ".documentContent #content a",
+ "attr": "href",
+ "technique": "Media link (Plone)",
+ "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+ },
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
+ {
+ "in_doc_url": "journals.uchicago.edu",
+ "in_fulltext_url": "pdf",
+ "selector": "nav.article__navbar a.ctrl--pdf",
+ "attr": "href",
+ "technique": "PDF Download link (journals.uchicago.edu)",
+ "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+ },
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
+ {
+ "in_doc_url": "cdnsciencepub.com",
+ "in_fulltext_url": "pdf",
+ "selector": "article .info-panel a.btn--pdf",
+ "attr": "href",
+ "technique": "cdnsciencepub.com PDF link",
+ "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+ },
+ {
+ "in_doc_url": "grrjournal.com",
+ "in_fulltext_url": "pdf",
+ "selector": ".ereaders-main-section a[download]",
+ "attr": "href",
+ "technique": "grrjournal.com PDF link",
+ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "pdf",
+ "selector": "#articleFullText a.remote_pdf",
+ "attr": "href",
+ "technique": "OJS remote_pdf link",
+ "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/view/",
+ "selector": "body .entry_details a.pdf",
+ "attr": "href",
+ "technique": "generic OJS/preprints",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body header a.download",
+ "attr": "href",
+ "technique": "generic OJS/preprints PDF Embed",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+ },
+]
+
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
+ # wiley has a weird almost-blank page we don't want to loop on
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "{'embed': '",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
+]
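+
+# FULLTEXT_URL_PATTERNS_SKIP entries are substring matches, while
+# FULLTEXT_URL_PREFIX_SKIP entries are prefix matches; both are checked against
+# the lowercased candidate URL in html_extract_fulltext_url().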
+
+RELEASE_TYPE_MAP: Dict[str, str] = {
+ "research article": "article-journal",
+ "text.serial.journal": "article-journal",
+}
+
+
+class BiblioMetadata(pydantic.BaseModel):
+ title: Optional[str]
+ subtitle: Optional[str]
+ contrib_names: Optional[List[str]]
+ release_date: Optional[datetime.date]
+ release_year: Optional[int]
+ release_type: Optional[str]
+ release_stage: Optional[str]
+ withdrawn_status: Optional[str]
+ lang: Optional[str]
+ country_code: Optional[str]
+ volume: Optional[str]
+ issue: Optional[str]
+ number: Optional[str]
+ pages: Optional[str]
+ first_page: Optional[str]
+ last_page: Optional[str]
+ license: Optional[str]
+ publisher: Optional[str]
+ container_name: Optional[str]
+ container_abbrev: Optional[str]
+ container_issn: Optional[str]
+ container_type: Optional[str]
+ raw_references: Optional[List[str]]
+
+ doi: Optional[str]
+ pmid: Optional[str]
+ pmcid: Optional[str]
+ isbn13: Optional[str]
+ publisher_ident: Optional[str]
+ oai_id: Optional[str]
+
+ abstract: Optional[str]
+ pdf_fulltext_url: Optional[str]
+ html_fulltext_url: Optional[str]
+ xml_fulltext_url: Optional[str]
+ component_url: Optional[str]
+
+ class Config:
+ json_encoders = {datetime.date: lambda dt: dt.isoformat()}
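+        # illustrative: with this encoder, .json() renders a release_date of
+        # datetime.date(2020, 1, 1) as "2020-01-01"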
+
+
+def html_extract_fulltext_url(
+ doc_url: str, doc: HTMLParser, patterns: List[dict]
+) -> Optional[Tuple[str, str]]:
+ """
+ Tries to quickly extract fulltext URLs using a set of patterns. This
+    function is intended to be generic across various extraction techniques.
+
+ Returns null or a tuple of (url, technique)
+ """
+ self_doc_url: Optional[Tuple[str, str]] = None
+ for pattern in patterns:
+ if "selector" not in pattern:
+ continue
+ if "in_doc_url" in pattern:
+ if pattern["in_doc_url"] not in doc_url:
+ continue
+ elem = doc.css_first(pattern["selector"])
+ if not elem:
+ continue
+ val = None
+ if "attr" in pattern:
+ val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
+ elif pattern.get("use_body"):
+ val = elem.text()
+ if "://" not in val:
+ continue
+ if not val:
+ continue
+ val = urllib.parse.urljoin(doc_url, val)
+ assert val
+ if "in_fulltext_url" in pattern:
+ if pattern["in_fulltext_url"] not in val:
+ continue
+ skip_matched = False
+ for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+ if skip_pattern in val.lower():
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ if url_fuzzy_equal(doc_url, val):
+ # don't link to self, unless no other options
+ self_doc_url = (val, pattern.get("technique", "unknown"))
+ continue
+
+ # quirks modes / hacks
+ if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+ val = val[:-1]
+
+ return (val, pattern.get("technique", "unknown"))
+ if self_doc_url:
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ return self_doc_url
+ return None
+
+
+def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
+
+ meta: Any = dict()
+ head = doc.css_first("head")
+ if not head:
+ print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
+ return None
+
+ for field, patterns in HEAD_META_PATTERNS.items():
+ for pattern in patterns:
+ val = head.css_first(pattern)
+ # print((field, pattern, val))
+ if val and "content" in val.attrs and val.attrs["content"]:
+ meta[field] = val.attrs["content"]
+ break
+
+ for field, patterns in HEAD_META_LIST_PATTERNS.items():
+ for pattern in patterns:
+ val_list = head.css(pattern)
+ if val_list:
+ for val in val_list:
+ if "content" in val.attrs and val.attrs["content"]:
+ if field not in meta:
+ meta[field] = []
+ meta[field].append(val.attrs["content"])
+ break
+
+ # (some) fulltext extractions
+ pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
+ if pdf_fulltext_url:
+ meta["pdf_fulltext_url"] = pdf_fulltext_url[0]
+ xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
+ if xml_fulltext_url:
+ meta["xml_fulltext_url"] = xml_fulltext_url[0]
+ html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
+ if html_fulltext_url:
+ meta["html_fulltext_url"] = html_fulltext_url[0]
+ component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+ if component_url:
+ meta["component_url"] = component_url[0]
+
+ # TODO: replace with clean_doi() et al
+ if meta.get("doi") and meta.get("doi").startswith("doi:"):
+ meta["doi"] = meta["doi"][4:]
+
+ raw_identifiers = meta.pop("raw_identifiers", [])
+ for ident in raw_identifiers:
+ if ident.startswith("doi:10."):
+ if "doi" not in meta:
+ meta["doi"] = ident.replace("doi:", "")
+ elif ident.startswith("10.") and "/" in ident:
+ if "doi" not in meta:
+ meta["doi"] = ident
+ elif ident.startswith("isbn:"):
+ if "isbn" not in meta:
+ meta["isbn"] = ident.replace("isbn:", "")
+
+ raw_date = meta.pop("raw_date", None)
+ if raw_date:
+ parsed = dateparser.parse(raw_date)
+ if parsed:
+ meta["release_date"] = parsed.date()
+
+ raw_release_type = meta.pop("raw_release_type", None)
+ if raw_release_type:
+ release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
+ if release_type:
+ meta["release_type"] = release_type
+
+ return BiblioMetadata(**meta)
+
+
+def load_adblock_rules() -> braveblock.Adblocker:
+ """
+ TODO: consider blocking very generic assets:
+ - ://fonts.googleapis.com/css*
+ - ://journals.plos.org/plosone/resource/img/icon.*
+ """
+ return braveblock.Adblocker(
+ include_easylist=True,
+ include_easyprivacy=True,
+ rules=[
+ "/favicon.ico^",
+ "||fonts.googleapis.com^",
+ "||widgets.figshare.com^",
+ "||crossmark-cdn.crossref.org^",
+ "||crossmark.crossref.org^",
+ "||platform.twitter.com^",
+ "||verify.nature.com^",
+ "||s7.addthis.com^",
+ "||www.mendeley.com^",
+ "||pbs.twimg.com^",
+ "||badge.dimensions.ai^",
+ "||recaptcha.net^",
+ "||tag.imagino.com^",
+ "||consent.cookiebot.com^",
+ "||recaptcha.net^",
+ # not sure about these CC badges (usually via a redirect)
+ # "||licensebuttons.net^",
+ # "||i.creativecommons.org^",
+ # Should we skip jquery, or other generic javascript CDNs?
+ # "||code.jquery.com^",
+ # "||ajax.googleapis.com^",
+ # "||cdnjs.cloudflare.com^",
+ # badges, "share" buttons, tracking, etc
+ "apis.google.com/js/plusone",
+ "www.google.com/recaptcha/",
+ "js/_getUACode.js"
+ # PLOS images
+ "/resource/img/icon.*.16.png^",
+ # CAIRN broken tracking tag
+ "cairn-int.info//about.php?cairn_guest=",
+ ],
+ )
+
+
+def _extract_generic(
+ doc: HTMLParser, selector: str, attrs: List[str], type_name: str
+) -> List[Dict[str, str]]:
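+    """
+    Helper for html_extract_resources(): for each element matching `selector`,
+    tries each attribute in `attrs`, collects URL values, and tags every result
+    with `type_name`. Obvious non-fetchable URIs (about:, data:, etc) are skipped.
+    """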
+ resources = []
+
+ for node in doc.css(selector):
+ for attr in attrs:
+ if attr not in node.attrs:
+ continue
+ url = node.attrs.get(attr)
+ # special-case a couple meta URI prefixes which don't match with adblock rules
+ skip = False
+ for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
+ if url and url.startswith(prefix):
+ skip = True
+ break
+ if url and "/" not in url and "." not in url and " " in url:
+ # eg: "Ce fichier n'existe pas"
+ skip = True
+ if skip:
+ continue
+ if url and url.startswith("https://https://"):
+ url = url[8:]
+ elif url and url.startswith("http://http://"):
+ url = url[7:]
+ if url:
+ # print(url, file=sys.stderr)
+ resources.append(dict(url=url.strip(), type=type_name))
+
+ return resources
+
+
+def html_extract_resources(
+ doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker
+) -> List[Dict[str, str]]:
+ """
+ This function tries to find all the important resources in a page. The
+ presumption is that the HTML document is article fulltext, and we want the
+ list of all resources (by URL) necessary to replay the page.
+
+ The returned resource URLs each have a type (script, img, css, etc), and
+ should be fully-qualified URLs (not relative).
+
+ Adblock filtering is run to remove unwanted resources.
+ """
+ resources = []
+
+ # select various resource references
+ resources += _extract_generic(doc, "script", ["src"], "script")
+ resources += _extract_generic(doc, "link[rel='stylesheet']", ["href"], "stylesheet")
+ # TODO: srcset and parse
+ # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w
+ resources += _extract_generic(doc, "img", ["src"], "image")
+ resources += _extract_generic(doc, "audio", ["src"], "audio")
+ resources += _extract_generic(doc, "video", ["src"], "media")
+ resources += _extract_generic(doc, "source", ["src"], "media")
+ resources += _extract_generic(doc, "track", ["src"], "media")
+ resources += _extract_generic(doc, "iframe", ["src"], "subdocument")
+ resources += _extract_generic(doc, "embed", ["src"], "media")
+
+ # ensure URLs are absolute
+ for r in resources:
+ r["url"] = urllib.parse.urljoin(doc_url, r["url"])
+
+ # filter using adblocker
+ resources = [
+ r
+ for r in resources
+ if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"])
+ is False
+ ]
+
+ # remove duplicates
+ resources = [dict(t) for t in {tuple(d.items()) for d in resources}]
+
+ return resources
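
Taken together, the helpers in this new module can be driven from a short script. A minimal sketch, assuming the selectolax HTMLParser type used throughout and that BiblioMetadata exposes the collected fields as attributes; the URL and filename are placeholders:

    from selectolax.parser import HTMLParser

    from sandcrawler.html_metadata import (
        html_extract_biblio,
        html_extract_resources,
        load_adblock_rules,
    )

    doc_url = "https://example.com/article/123"   # hypothetical landing page URL
    with open("article.html", "rb") as f:         # previously fetched HTML body
        doc = HTMLParser(f.read())

    # structured metadata from <head>: DOI, fulltext links, etc
    biblio = html_extract_biblio(doc_url, doc)
    if biblio:
        print(biblio.doi, biblio.pdf_fulltext_url)

    # absolute sub-resource URLs needed to replay the page, filtered by adblock rules
    adblock = load_adblock_rules()
    for res in html_extract_resources(doc_url, doc, adblock):
        print(res["type"], res["url"])
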
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 49f5ad4..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1,24 +1,31 @@
-
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os, sys, time
-import requests
import datetime
+import gzip
+import http.client
+import json
+import os
+import sys
+import time
+import urllib.parse
from collections import namedtuple
+from http.client import IncompleteRead
+from typing import Any, Dict, List, Optional, Tuple, Union
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
-from http.client import IncompleteRead
+from gwb.loader import CDXLoaderFactory3
from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
+
class SandcrawlerBackoffError(Exception):
"""
@@ -27,62 +34,78 @@ class SandcrawlerBackoffError(Exception):
be passed up through any timeout/retry code and become an actual long pause
or crash.
"""
+
pass
-ResourceResult = namedtuple("ResourceResult", [
- "start_url",
- "hit",
- "status",
- "terminal_url",
- "terminal_dt",
- "terminal_status_code",
- "body",
- "cdx",
- "revisit_cdx",
-])
-
-WarcResource = namedtuple("WarcResource", [
- "status_code",
- "location",
- "body",
- "revisit_cdx",
-])
-
-CdxRow = namedtuple('CdxRow', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
- 'warc_csize',
- 'warc_offset',
- 'warc_path',
-])
-
-CdxPartial = namedtuple('CdxPartial', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
-])
-
-def cdx_partial_from_row(full):
+
+ResourceResult = namedtuple(
+ "ResourceResult",
+ [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+ "revisit_cdx",
+ ],
+)
+
+WarcResource = namedtuple(
+ "WarcResource",
+ [
+ "status_code",
+ "location",
+ "body",
+ "revisit_cdx",
+ ],
+)
+
+CdxRow = namedtuple(
+ "CdxRow",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ "warc_csize",
+ "warc_offset",
+ "warc_path",
+ ],
+)
+
+CdxPartial = namedtuple(
+ "CdxPartial",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ ],
+)
+
+
+def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
return CdxPartial(
- surt=full.surt,
- datetime=full.datetime,
- url=full.url,
- mimetype=full.mimetype,
- status_code=full.status_code,
- sha1b32=full.sha1b32,
- sha1hex=full.sha1hex,
+ surt=row.surt,
+ datetime=row.datetime,
+ url=row.url,
+ mimetype=row.mimetype,
+ status_code=row.status_code,
+ sha1b32=row.sha1b32,
+ sha1hex=row.sha1hex,
)
-def cdx_to_dict(cdx):
+
+def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
d = {
"surt": cdx.surt,
"datetime": cdx.datetime,
@@ -92,61 +115,82 @@ def cdx_to_dict(cdx):
"sha1b32": cdx.sha1b32,
"sha1hex": cdx.sha1hex,
}
- if type(cdx) == CdxRow and '/' in cdx.warc_path:
- d['warc_csize'] = cdx.warc_csize
- d['warc_offset'] = cdx.warc_offset
- d['warc_path'] = cdx.warc_path
+ if type(cdx) == CdxRow and "/" in cdx.warc_path:
+ d["warc_csize"] = cdx.warc_csize
+ d["warc_offset"] = cdx.warc_offset
+ d["warc_path"] = cdx.warc_path
return d
-def fuzzy_match_url(left, right):
+
+def fuzzy_match_url(left: str, right: str) -> bool:
"""
Matches URLs agnostic of http/https (and maybe other normalizations in the
future)
"""
if left == right:
return True
- if '://' in left and '://' in right:
- if left.split('://')[1:] == right.split('://')[1:]:
- return True
+ if "://" in left and "://" in right:
+ left = "://".join(left.split("://")[1:])
+ right = "://".join(right.split("://")[1:])
+ if left == right:
+ return True
+ if left == right + "/" or right == left + "/":
+ return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
return False
-def test_fuzzy_match_url():
- assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+
+def test_fuzzy_match_url() -> None:
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False
- assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
+
class CdxApiError(Exception):
pass
-class CdxApiClient:
- def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
+class CdxApiClient:
+ def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- cdx_auth_token = kwargs.get('cdx_auth_token',
- os.environ.get('CDX_AUTH_TOKEN'))
+ cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN"))
if not cdx_auth_token:
- raise Exception("CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)")
- self.http_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
- 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
- })
+ raise Exception(
+ "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)"
+ )
+ self.http_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient",
+ "Cookie": "cdx_auth_token={}".format(cdx_auth_token),
+ }
+ )
- def _query_api(self, params):
+ def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
"""
Hits CDX API with a query, parses result into a list of CdxRow
"""
resp = self.http_session.get(self.host_url, params=params)
if resp.status_code != 200:
raise CdxApiError(resp.text)
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
if not resp.text:
return None
rj = resp.json()
@@ -154,8 +198,10 @@ class CdxApiClient:
return None
rows = []
for raw in rj[1:]:
- assert len(raw) == 11 # JSON is short
- #print(raw, file=sys.stderr)
+ # check number of CDX fields; there is a bug with some rows having
+ # spaces in WARC filename resulting in extra bogus fields
+ if len(raw) != 11:
+ raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")
# transform "-" ftp status code to a 226
status_code = None
@@ -165,8 +211,17 @@ class CdxApiClient:
else:
status_code = int(raw[4])
- # CDX rows with no WARC records?
- if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+ # remove CDX rows with no WARC records (?)
+ if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
+ continue
+
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
continue
row = CdxRow(
@@ -185,43 +240,75 @@ class CdxApiClient:
rows.append(row)
return rows
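
For reference, each row in the CDX API JSON response (after the header row) is expected to carry eleven fields; the index-based parsing above lines up with the usual wayback CDX field order. An invented example, shown only to document the indices:

    raw = [
        "org,example)/paper.pdf",            # 0: SURT
        "20200220201241",                    # 1: capture datetime
        "https://example.org/paper.pdf",     # 2: original URL
        "application/pdf",                   # 3: mimetype
        "200",                               # 4: status code ("-" for some FTP captures)
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",  # 5: SHA-1 digest (base32)
        "-",                                 # 6: redirect
        "-",                                 # 7: robot flags
        "12345",                             # 8: WARC record compressed size
        "89012345",                          # 9: WARC offset
        "EXAMPLE-20200220-00001.warc.gz",    # 10: WARC filename
    ]
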
- def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None):
+ def fetch(
+ self,
+ url: str,
+ datetime: str,
+ filter_status_code: Optional[int] = None,
+ retry_sleep: Optional[int] = None,
+ ) -> CdxRow:
"""
Fetches a single CDX row by url/datetime. Raises a KeyError if not
found, because we expect to be looking up a specific full record.
"""
if len(datetime) != 14:
- raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
- params = {
- 'url': url,
- 'from': datetime,
- 'to': datetime,
- 'matchType': 'exact',
- 'limit': 1,
- 'output': 'json',
+ raise ValueError(
+ "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)
+ )
+ params: Dict[str, str] = {
+ "url": url,
+ "from": datetime,
+ "to": datetime,
+ "matchType": "exact",
+ "limit": "1",
+ "output": "json",
}
if filter_status_code:
- params['filter'] = "statuscode:{}".format(filter_status_code)
+ params["filter"] = "statuscode:{}".format(filter_status_code)
resp = self._query_api(params)
if not resp:
- if retry_sleep:
- print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ if retry_sleep and retry_sleep > 0:
+ next_sleep = None
+ if retry_sleep > 3:
+ next_sleep = retry_sleep - 3
+ retry_sleep = 3
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep
+ )
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
# allow fuzzy http/https match
if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
- if retry_sleep:
- print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ if retry_sleep and retry_sleep > 0:
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
- raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=None
+ )
+ raise KeyError(
+ "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(
+ url, datetime, row
+ )
+ )
if filter_status_code:
assert row.status_code == filter_status_code
return row
- def lookup_best(self, url, max_age_days=None, best_mimetype=None):
+ def lookup_best(
+ self,
+ url: str,
+ max_age_days: Optional[int] = None,
+ best_mimetype: Optional[str] = None,
+ closest: Union[datetime.datetime, str, None] = None,
+ ) -> Optional[CdxRow]:
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -244,38 +331,50 @@ class CdxApiClient:
most-recent
"""
- params = {
- 'url': url,
- 'matchType': 'exact',
- 'limit': -25,
- 'output': 'json',
+ params: Dict[str, str] = {
+ "url": url,
+ "matchType": "exact",
+ "limit": "-40",
+ "output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
#'collapse': 'timestamp:6',
-
# Revisits now allowed and resolved!
#'filter': '!mimetype:warc/revisit',
}
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
- params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+ params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
+ if closest:
+ if isinstance(closest, datetime.datetime):
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
+ else:
+ closest_dt = closest
+ params["closest"] = closest_dt
+ params["sort"] = "closest"
+ # print(params, file=sys.stderr)
rows = self._query_api(params)
if not rows:
return None
- def _cdx_sort_key(r):
+ def _cdx_sort_key(r: CdxRow) -> tuple:
"""
This is a function, not a lambda, because it captures
best_mimetype. Will create a tuple that can be used to sort in
*reverse* order.
"""
return (
+ int(r.url == url),
int(r.status_code in (200, 226)),
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
- int('/' in r.warc_path),
+ r.datetime[:4] == closest_dt[:4],
int(r.datetime),
+ # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+ int("/" in r.warc_path),
)
rows = sorted(rows, key=_cdx_sort_key)
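
Because _cdx_sort_key() returns a tuple, rows are compared element-by-element: earlier elements dominate and later ones only break ties, and the best candidate ends up last after an ascending sort (the final pick happens just past this hunk). A tiny illustration of the tuple-sort idea, with invented values:

    candidates = [
        (0, 1, "20210101"),  # inexact URL match, 200 status, recent
        (1, 0, "20210101"),  # exact URL match, non-200 status, recent
        (1, 1, "20150101"),  # exact URL match, 200 status, older
    ]
    # exact match and good status dominate recency
    assert sorted(candidates)[-1] == (1, 1, "20150101")
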
@@ -285,33 +384,48 @@ class CdxApiClient:
class WaybackError(Exception):
pass
+
+class WaybackContentError(Exception):
+ pass
+
+
class PetaboxError(Exception):
pass
-class WaybackClient:
- def __init__(self, cdx_client=None, **kwargs):
+class NoCaptureError(Exception):
+ pass
+
+
+class WaybackClient:
+ def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs):
if cdx_client:
self.cdx_client = cdx_client
else:
self.cdx_client = CdxApiClient()
# /serve/ instead of /download/ doesn't record view count
# this *does* want to be http://, not https://
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
self.petabox_webdata_secret = kwargs.get(
- 'petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'),
+ "petabox_webdata_secret",
+ os.environ.get("PETABOX_WEBDATA_SECRET"),
)
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/")
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
self.replay_headers = {
- 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
+ self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
- def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
+ def fetch_petabox(
+ self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
+ ) -> WarcResource:
"""
Fetches wayback resource directly from petabox using WARC path/offset/csize.
@@ -334,28 +448,56 @@ class WaybackClient:
"""
if not self.petabox_webdata_secret:
raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
- if not "/" in warc_path:
- raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+ if "/" not in warc_path:
+ raise ValueError(
+ "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)
+ )
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ )
+ )
+ assert self.rstore
try:
- #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
- print("Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
- raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
+ except wayback.exception.InvalidResource:
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise WaybackContentError(
+ "failed to load file contents from wayback/petabox (InvalidResource)"
+ )
+ except urllib3.exceptions.ReadTimeoutError as rte:
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(
+ rte
+ )
+ )
except ValueError as ve:
- raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)
+ )
except EOFError as eofe:
- raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)
+ )
except TypeError as te:
- raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ )
+ )
except Exception as e:
if "while decompressing data: invalid block type" in str(e):
- raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ raise PetaboxError(
+ "decompression error fetching WARC record; usually due to bad alexa ARC files"
+ )
else:
raise e
# Note: could consider a generic "except Exception" here, as we get so
@@ -365,10 +507,14 @@ class WaybackClient:
try:
status_code = gwb_record.get_status()[0]
except http.client.HTTPException:
- raise WaybackError("too many HTTP headers (in wayback fetch)")
+ raise WaybackContentError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
- if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
+ if (
+ status_code is None
+ and gwb_record.target_uri.startswith(b"ftp://")
+ and not gwb_record.is_revisit()
+ ):
# TODO: some additional verification here?
status_code = 226
@@ -376,37 +522,47 @@ class WaybackClient:
revisit_cdx = None
if gwb_record.is_revisit():
if not resolve_revisit:
- raise WaybackError("found revisit record, but won't resolve (loop?)")
+ raise WaybackContentError("found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
if not (revisit_uri and revisit_dt):
- raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
- warc_path, offset))
+ raise WaybackContentError(
+ "revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, offset
+ )
+ )
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
- revisit_uri = revisit_uri.decode('utf-8')
- revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ if type(revisit_uri) is bytes:
+ revisit_uri = revisit_uri.decode("utf-8")
+ if type(revisit_dt) is bytes:
+ revisit_dt = revisit_dt.decode("utf-8")
+ revisit_dt = (
+ revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "")
+ )
assert len(revisit_dt) == 14
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+ body = self.fetch_petabox_body(
+ csize=revisit_cdx.warc_csize,
+ offset=revisit_cdx.warc_offset,
+ warc_path=revisit_cdx.warc_path,
+ resolve_revisit=False,
+ expected_status_code=revisit_cdx.status_code,
+ )
except KeyError as ke:
raise WaybackError("Revist resolution failed: {}".format(ke))
- body = self.fetch_petabox_body(
- csize=revisit_cdx.warc_csize,
- offset=revisit_cdx.warc_offset,
- warc_path=revisit_cdx.warc_path,
- resolve_revisit=False,
- expected_status_code=revisit_cdx.status_code,
- )
elif status_code in (200, 226):
try:
body = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
raise WaybackError(
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ )
+ )
elif status_code is None:
- raise WaybackError(
- "got a None status_code in (W)ARC record")
+ raise WaybackContentError("got a None status_code in (W)ARC record")
return WarcResource(
status_code=status_code,
location=location,
@@ -414,7 +570,14 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
- def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None):
+ def fetch_petabox_body(
+ self,
+ csize: int,
+ offset: int,
+ warc_path: str,
+ resolve_revisit: bool = True,
+ expected_status_code: Optional[int] = None,
+ ) -> bytes:
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -431,19 +594,22 @@ class WaybackClient:
if expected_status_code:
if expected_status_code != resource.status_code:
- raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
- expected_status_code,
- resource.status_code,
+ raise KeyError(
+ "archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
)
)
elif resource.status_code not in (200, 226):
- raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
- resource.status_code)
+ raise KeyError(
+ "archived HTTP response (WARC) was not 200: {}".format(resource.status_code)
)
return resource.body
- def fetch_replay_body(self, url, datetime, cdx_sha1hex=None):
+ def fetch_replay_body(
+ self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None
+ ) -> bytes:
"""
Fetches an HTTP 200 record from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -464,46 +630,59 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = requests.get(
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
- raise WaybackError("redirect loop (wayback replay fetch)")
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
-
- try:
- resp.raise_for_status()
- except Exception as e:
- raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
# defensively check that this is actually correct replay based on headers
- if not "X-Archive-Src" in resp.headers:
+ if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+            # otherwise, a weird case (200/redirect but no X-Archive-Src header)
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
if cdx_sha1hex:
# verify that body matches CDX hash
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
- if cdx_sha1hex != file_meta['sha1hex']:
- print("REPLAY MISMATCH: cdx:{} replay:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
- file=sys.stderr)
- raise WaybackError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
+ if cdx_sha1hex != file_meta["sha1hex"]:
+ print(
+ " REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ file=sys.stderr,
+ )
+ raise WaybackContentError(
+ "replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
)
return resp.content
- def fetch_replay_redirect(self, url, datetime):
+ def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
"""
Fetches an HTTP 3xx redirect Location from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -520,41 +699,65 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = requests.get(
+ # when fetching via `id_`, it is possible to get a 5xx error which
+ # is either a wayback error, or an actual replay of an upstream 5xx
+ # error. the exception control flow here is tweaked, and a
+ # different HTTP session is used, to try and differentiate between
+ # the two cases
+ resp = None
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
+ resp.raise_for_status()
except requests.exceptions.TooManyRedirects:
- raise WaybackError("redirect loop (wayback replay fetch)")
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
- try:
- resp.raise_for_status()
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
except Exception as e:
+ if resp is not None and "X-Archive-Src" in resp.headers:
+ raise WaybackContentError(
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
+ )
raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
- if not "X-Archive-Src" in resp.headers:
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
redirect_url = resp.headers.get("Location")
# eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("http"):
redirect_url = clean_url(redirect_url)
return redirect_url
else:
return None
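
The Location header returned by an id_ replay can itself be wrapped in a wayback URL; the split above strips the https://web.archive.org/web/<dt>id_/ prefix. Using the example URL from the comment:

    loc = "https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw"
    # dropping the first five "/"-separated pieces removes the replay prefix
    assert "/".join(loc.split("/")[5:]) == "https://dx.doi.org/10.17504/protocols.io.y2gfybw"
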
- def lookup_resource(self, start_url, best_mimetype=None):
+ def lookup_resource(
+ self,
+ start_url: str,
+ best_mimetype: Optional[str] = None,
+ closest: Union[str, datetime.datetime, None] = None,
+ ) -> ResourceResult:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -580,16 +783,18 @@ class WaybackClient:
"""
next_url = start_url
urls_seen = [start_url]
- for i in range(self.max_redirects):
+ for i in range(self.max_redirects + 1):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype)
- #print(cdx_row, file=sys.stderr)
- if not cdx_row:
+ next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
+ next_url, best_mimetype=best_mimetype, closest=closest
+ )
+ # print(next_row, file=sys.stderr)
+ if not next_row:
return ResourceResult(
start_url=start_url,
hit=False,
status="no-capture",
- terminal_url=None,
+ terminal_url=next_url,
terminal_dt=None,
terminal_status_code=None,
body=None,
@@ -597,8 +802,10 @@ class WaybackClient:
revisit_cdx=None,
)
+ cdx_row: CdxRow = next_row
+
# first try straight-forward redirect situation
- if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
+ if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -611,15 +818,17 @@ class WaybackClient:
status="success",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
- terminal_status_code=resource.revisit_cdx.status_code, # ?
+ terminal_status_code=resource.revisit_cdx.status_code,
body=resource.body,
cdx=cdx_row,
revisit_cdx=resource.revisit_cdx,
)
+ # else, continue processing with revisit record
if cdx_row.status_code in (200, 226):
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -632,7 +841,7 @@ class WaybackClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
return ResourceResult(
start_url=start_url,
hit=True,
@@ -641,11 +850,11 @@ class WaybackClient:
terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
body=body,
- cdx=cdx_row,
+ cdx=final_cdx,
revisit_cdx=revisit_cdx,
)
elif 300 <= (cdx_row.status_code or 0) < 400:
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -654,7 +863,7 @@ class WaybackClient:
)
assert 300 <= resource.status_code < 400
if not resource.location:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -666,24 +875,23 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- if resource.location.startswith('/'):
- # redirect location does not include hostname
- domain_prefix = '/'.join(next_url.split('/')[:3])
- next_url = domain_prefix + resource.location
+ if "://" not in resource.location:
+ next_url = urllib.parse.urljoin(next_url, resource.location)
else:
next_url = resource.location
if next_url:
next_url = clean_url(next_url)
else:
- next_url = self.fetch_replay_redirect(
+ redirect_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- if next_url:
- next_url = clean_url(next_url)
- cdx_row = cdx_partial_from_row(cdx_row)
- if not next_url:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ if redirect_url:
+ redirect_url = clean_url(redirect_url)
+ if redirect_url:
+ next_url = redirect_url
+ else:
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -721,6 +929,7 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
+
return ResourceResult(
start_url=start_url,
hit=False,
@@ -737,39 +946,72 @@ class WaybackClient:
class SavePageNowError(Exception):
pass
+
class SavePageNowBackoffError(SandcrawlerBackoffError):
pass
-SavePageNowResult = namedtuple('SavePageNowResult', [
- 'success',
- 'status',
- 'job_id',
- 'request_url',
- 'terminal_url',
- 'terminal_dt',
- 'resources',
-])
-class SavePageNowClient:
+SavePageNowResult = namedtuple(
+ "SavePageNowResult",
+ [
+ "success",
+ "status",
+ "job_id",
+ "request_url",
+ "terminal_url",
+ "terminal_dt",
+ "resources",
+ ],
+)
+
- def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
- self.ia_access_key = kwargs.get('ia_access_key',
- os.environ.get('IA_ACCESS_KEY'))
- self.ia_secret_key = kwargs.get('ia_secret_key',
- os.environ.get('IA_SECRET_KEY'))
+class SavePageNowClient:
+ def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
+ self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
+ self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
self.v2endpoint = v2endpoint
- self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
- self.v2_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
- 'Accept': 'application/json',
- 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
- })
+ self.v2_session = requests_retry_session(
+ retries=5, backoff_factor=3, status_forcelist=[502, 504]
+ )
+ self.v2_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
+ "Accept": "application/json",
+ "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key),
+ }
+ )
# 3 minutes total
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url, force_get=0, capture_outlinks=0):
+ self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0)
+
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def save_url_now_v2(
+ self,
+ request_url: str,
+ force_simple_get: Optional[int] = None,
+ capture_outlinks: int = 0,
+ ) -> SavePageNowResult:
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -790,7 +1032,7 @@ class SavePageNowClient:
non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc.
"""
if capture_outlinks:
- print(" capturing outlinks!", file=sys.stdout)
+ print(" capturing outlinks!", file=sys.stderr)
if not (self.ia_access_key and self.ia_secret_key):
raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
if request_url.startswith("ftp://"):
@@ -803,79 +1045,163 @@ class SavePageNowClient:
None,
None,
)
- resp = self.v2_session.post(
- self.v2endpoint,
- data={
- 'url': request_url,
- 'capture_all': 1,
- 'capture_outlinks': capture_outlinks,
- 'capture_screenshot': 0,
- 'if_not_archived_within': '1d',
- 'force_get': force_get,
- },
- )
+ if force_simple_get is None:
+ force_simple_get = 0
+ for domain in self.simple_get_domains:
+ if domain in request_url:
+ force_simple_get = 1
+ break
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
if resp.status_code == 429:
- raise SavePageNowBackoffError("status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
elif resp.status_code != 200:
- raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ "status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(
+ "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ resp.raise_for_status()
resp_json = resp.json()
- if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json['message']:
- raise SavePageNowBackoffError(resp_json['message'])
- elif not resp_json or 'job_id' not in resp_json:
+ if (
+ resp_json
+ and "message" in resp_json
+ and "You have already reached the limit of active sessions" in resp_json["message"]
+ ):
+ raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
- "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
+ )
- job_id = resp_json['job_id']
+ job_id = resp_json["job_id"]
+ print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
# poll until complete
final_json = None
for i in range(self.poll_count):
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id']))
+ resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
- status = resp.json()['status']
- if status == 'pending':
+ status = resp.json()["status"]
+ if status == "pending":
time.sleep(self.poll_seconds)
- elif status in ('success', 'error'):
+ elif status in ("success", "error"):
final_json = resp.json()
break
else:
- raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url))
+ raise SavePageNowError(
+ "Unknown SPN2 status:{} url:{}".format(status, request_url)
+ )
if not final_json:
raise SavePageNowError("SPN2 timed out (polling count exceeded)")
# if there was a recent crawl of same URL, fetch the status of that
# crawl to get correct datetime
- if final_json.get('original_job_id'):
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id']))
+ if final_json.get("original_job_id"):
+ print(
+ f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
+ file=sys.stderr,
+ )
+ resp = self.v2_session.get(
+ "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"])
+ )
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
- #print(final_json, file=sys.stderr)
+ # print(final_json, file=sys.stderr)
- if final_json['status'] == "success":
+ if final_json["status"] == "success":
+            if final_json.get("original_url", "").startswith("/"):
+ print(
+ f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}",
+ file=sys.stderr,
+ )
return SavePageNowResult(
True,
"success",
job_id,
request_url,
- final_json['original_url'],
- final_json['timestamp'],
- final_json['resources'],
+ final_json["original_url"],
+ final_json["timestamp"],
+ final_json.get("resources") or None,
)
else:
- if final_json['status'] == 'pending':
- final_json['status'] = 'error:pending'
+ if final_json["status"] == "pending":
+ final_json["status"] = "error:pending"
return SavePageNowResult(
False,
- final_json.get('status_ext') or final_json['status'],
+ final_json.get("status_ext") or final_json["status"],
job_id,
request_url,
None,
@@ -883,30 +1209,53 @@ class SavePageNowClient:
None,
)
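
Stripped of the error handling and capacity checks above, the SPNv2 interaction is a POST to the /save endpoint followed by polling /save/status/<job_id>. A hedged sketch with plain requests; credentials and the example URL are placeholders:

    import time
    import requests

    session = requests.Session()
    session.headers.update({
        "Accept": "application/json",
        "Authorization": "LOW <IA_ACCESS_KEY>:<IA_SECRET_KEY>",
    })
    endpoint = "https://web.archive.org/save"

    resp = session.post(endpoint, data={"url": "https://example.com/paper", "capture_all": 1})
    job_id = resp.json()["job_id"]

    final = None
    for _ in range(60):          # poll_count
        status = session.get(f"{endpoint}/status/{job_id}").json()
        if status["status"] in ("success", "error"):
            final = status
            break
        time.sleep(3.0)          # poll_seconds
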
- def crawl_resource(self, start_url, wayback_client, force_get=0):
+ def crawl_resource(
+ self,
+ start_url: str,
+ wayback_client: WaybackClient,
+ force_simple_get: Optional[int] = None,
+ ) -> ResourceResult:
"""
- Runs a SPN2 crawl, then fetches body from wayback.
+        Runs a SPN2 crawl, then fetches the body.
- TODO: possible to fetch from petabox?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
- if 'gzbd.cnki.net/' in start_url:
- spn_result = self.save_url_now_v2(start_url, force_get=force_get, capture_outlinks=1)
+ if "gzbd.cnki.net/" in start_url:
+ spn_result = self.save_url_now_v2(
+ start_url, force_simple_get=force_simple_get, capture_outlinks=1
+ )
else:
- spn_result = self.save_url_now_v2(start_url, force_get=force_get)
+ spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
if not spn_result.success:
status = spn_result.status
- if status in ("error:invalid-url", "error:not-found",
- "error:invalid-host-resolution", "error:gateway-timeout"):
+ if status in (
+ "error:invalid-url",
+ "error:not-found",
+ "error:invalid-host-resolution",
+ "error:gateway-timeout",
+ "error:too-many-redirects",
+ "error:read-timeout",
+ ):
status = status.replace("error:", "")
- elif status == "error:no-access":
+ elif status in ("error:no-access", "error:forbidden"):
status = "forbidden"
elif status == "error:user-session-limit":
raise SavePageNowBackoffError("SPNv2 user-session-limit")
+ elif status == "error:internal-server-error":
+ status = "remote-server-error"
elif status.startswith("error:"):
status = "spn2-" + status
+ # despite other errors, call these a failure (so we don't retry)
+ if spn_result.terminal_url and (
+ spn_result.terminal_url.endswith("/cookieAbsent")
+ or spn_result.terminal_url.endswith("cookieSet=1")
+ ):
+ status = "blocked-cookie"
return ResourceResult(
start_url=start_url,
hit=False,
@@ -918,9 +1267,39 @@ class SavePageNowClient:
cdx=None,
revisit_cdx=None,
)
- #print(spn_result, file=sys.stderr)
+ # print(spn_result, file=sys.stderr)
- cdx_row = None
+ # detect partial URL response (aka, success, but missing full URL)
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-success-partial-url",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ # don't try to CDX fetch for this common cookie block terminal
+ if spn_result.terminal_url.endswith(
+ "/cookieAbsent"
+ ) or spn_result.terminal_url.endswith("cookieSet=1"):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="blocked-cookie",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ cdx_row: Optional[CdxRow] = None
# hack to work around elsevier weirdness
if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
@@ -928,26 +1307,35 @@ class SavePageNowClient:
best_mimetype="application/pdf",
)
if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
- print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
cdx_row = elsevier_pdf_cdx
else:
- print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
- #print(elsevier_pdf_cdx, file=sys.stderr)
+ print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ # print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
# lookup exact
try:
- filter_status_code = 200
+ filter_status_code = None
if spn_result.terminal_url.startswith("ftp://"):
filter_status_code = 226
cdx_row = wayback_client.cdx_client.fetch(
url=spn_result.terminal_url,
datetime=spn_result.terminal_dt,
filter_status_code=filter_status_code,
- retry_sleep=10.0,
+ retry_sleep=self.spn_cdx_retry_sec,
)
+ # sometimes there are fuzzy http/https self-redirects with the
+ # same SURT; try to work around that
+ if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=self.spn_cdx_retry_sec,
+ )
except KeyError as ke:
- print("CDX KeyError: {}".format(ke), file=sys.stderr)
+ print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -960,10 +1348,11 @@ class SavePageNowClient:
revisit_cdx=None,
)
- #print(cdx_row, file=sys.stderr)
+ # print(cdx_row, file=sys.stderr)
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live
resource = wayback_client.fetch_petabox(
csize=cdx_row.warc_csize,
@@ -976,22 +1365,82 @@ class SavePageNowClient:
revisit_cdx = resource.revisit_cdx
else:
# note: currently not trying to verify cdx_row.sha1hex
- body = wayback_client.fetch_replay_body(
- url=cdx_row.url,
- datetime=cdx_row.datetime,
- )
+ try:
+ body = wayback_client.fetch_replay_body(
+ url=cdx_row.url,
+ datetime=cdx_row.datetime,
+ )
+ except (WaybackError, WaybackContentError):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-wayback-error",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
# warc_path etc will change, so strip them out
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
- return ResourceResult(
- start_url=start_url,
- hit=True,
- status="success",
- terminal_url=cdx_row.url,
- terminal_dt=cdx_row.datetime,
- terminal_status_code=cdx_row.status_code,
- body=body,
- cdx=cdx_row,
- revisit_cdx=revisit_cdx,
- )
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+
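End-to-end, crawl_resource() above is the piece ingest workers call. A hedged usage sketch; it assumes CDX_AUTH_TOKEN and IA_ACCESS_KEY/IA_SECRET_KEY are set in the environment, as required by the clients defined in this module:

    from sandcrawler.ia import SavePageNowClient, WaybackClient

    wayback_client = WaybackClient()
    spn_client = SavePageNowClient()
    result = spn_client.crawl_resource("https://example.com/paper.pdf", wayback_client)
    print(result.status, result.hit, result.terminal_url)
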
+def fix_transfer_encoding(
+ file_meta: dict, resource: ResourceResult
+) -> Tuple[dict, ResourceResult]:
+ if (
+ resource.body
+ and file_meta["mimetype"] == "application/gzip"
+ and resource.cdx
+ and resource.cdx.mimetype != "application/gzip"
+ ):
+ print(
+ " transfer encoding not stripped: {}".format(resource.cdx.mimetype),
+ file=sys.stderr,
+ )
+ inner_body = gzip.decompress(resource.body)
+ if not inner_body:
+ raise Exception("null body inside transfer encoding")
+ inner_resource = ResourceResult(
+ body=inner_body,
+ # copy all other fields
+ start_url=resource.start_url,
+ hit=resource.hit,
+ status=resource.status,
+ terminal_url=resource.terminal_url,
+ terminal_dt=resource.terminal_dt,
+ terminal_status_code=resource.terminal_status_code,
+ cdx=resource.cdx,
+ revisit_cdx=resource.revisit_cdx,
+ )
+ inner_file_meta = gen_file_metadata(inner_resource.body)
+ return (inner_file_meta, inner_resource)
+ else:
+ return (file_meta, resource)
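
fix_transfer_encoding() covers captures where the gzip Content-Encoding was never stripped, so file_meta reports application/gzip while the CDX row claims something else. A small sketch of the situation it unwraps; the PDF bytes are a placeholder:

    import gzip

    inner = b"%PDF-1.4 ..."           # placeholder for the real payload
    wire_body = gzip.compress(inner)  # what the replay fetch actually returned
    # after fix_transfer_encoding(), downstream code sees the decompressed body
    assert gzip.decompress(wire_body) == inner
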
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
deleted file mode 100644
index f4e78e4..0000000
--- a/python/sandcrawler/ingest.py
+++ /dev/null
@@ -1,446 +0,0 @@
-
-import sys
-import json
-import gzip
-import base64
-import requests
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from collections import namedtuple
-
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
-from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata, clean_url
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
-
-
-class IngestFileWorker(SandcrawlerWorker):
- """
- High level flow is to look in history first, then go to live web if
- resource not found. Following redirects is treated as "fetching a
- resource". Current version fetches a single resource; if it isn't a hit
- but is an HTML 200, treats it as a landing page, tries to extract
- fulltext link, then fetches that resource.
-
- process(request, key=None) -> response
- Does all the things!
-
- Check existing processing (short circuit):
-
- check_existing_ingest(base_url) -> ingest_file_result or none
- process_existing(result) -> response
- try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
-
- Fetch resource:
-
- find_resource(url) -> ResourceResult
-
- Process resource:
-
- process_hit(ResourceResult) -> response
- process_grobid(ResourceResult)
- """
-
- def __init__(self, sink=None, **kwargs):
- super().__init__()
-
- self.sink = sink
- self.wayback_client = kwargs.get('wayback_client')
- if not self.wayback_client:
- self.wayback_client = WaybackClient()
- self.spn_client = kwargs.get('spn_client')
- if not self.spn_client:
- self.spn_client = SavePageNowClient()
- self.grobid_client = kwargs.get('grobid_client')
- if not self.grobid_client:
- self.grobid_client = GrobidClient()
- self.pgrest_client = kwargs.get('pgrest_client')
- if not self.pgrest_client:
- self.pgrest_client = SandcrawlerPostgrestClient()
- self.grobid_sink = kwargs.get('grobid_sink')
-
- self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
- self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
- self.try_wayback = kwargs.get('try_wayback', True)
- self.try_spn2 = kwargs.get('try_spn2', True)
-
- self.base_url_blocklist = [
- # temporary, until we implement specific fetch and 'petabox' output
- "://archive.org/",
- "://web.archive.org/web/",
- "://openlibrary.org/",
- "://fatcat.wiki/",
-
- # Domain squats
- "://bartandjones.com",
- "://ijretm.com",
- "://ijrcemas.com",
- "://jist.net.in",
- "://croisements-revue.org",
-
- # all stubs/previews, not full papers
- "://page-one.live.cf.public.springer.com",
-
- # large datasets-only (no PDF expected)
- "plutof.ut.ee/",
- "www.gbif.org/",
- "doi.pangaea.de/",
- "www.plate-archive.org/",
- "://doi.org/10.25642/ipk/gbis/",
- "://apex.ipk-gatersleben.de/",
-
- # Historical non-paper content:
- "dhz.uni-passau.de/", # newspapers
- "digital.ucd.ie/", # ireland national historical
- ]
-
- # these are special-case web domains for which we want SPN2 to not run
- # a headless browser (brozzler), but instead simply run wget.
- # the motivation could be to work around browser issues, or in the
- # future possibly to increase download efficiency (wget/fetch being
- # faster than browser fetch)
- self.spn2_simple_get_domains = [
- ]
-
-
- def check_existing_ingest(self, base_url):
- """
- Check in sandcrawler-db (postgres) to see if we have already ingested
- this URL (ingest file result table).
-
- Returns existing row *if* found *and* we should use it, otherwise None.
-
- Looks at existing ingest results and makes a decision based on, eg,
- status and timestamp.
- """
- if not self.try_existing_ingest:
- return None
- existing = self.pgrest_client.get_ingest_file_result(base_url)
- # TODO: filter on more flags?
- if existing and existing['hit'] == True:
- return existing
- else:
- return None
-
- def find_resource(self, url, best_mimetype=None, force_recrawl=False):
- """
- Looks in wayback for a resource starting at the URL, following any
- redirects. If a hit isn't found, try crawling with SPN.
- """
- via = "none"
- resource = None
-
- if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"):
- raise NotImplementedError("handling direct wayback links not supported yet")
-
- if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
- raise NotImplementedError("fetching from archive.org not implemented yet")
-
- if self.try_wayback and not force_recrawl:
- via = "wayback"
- resource = self.wayback_client.lookup_resource(url, best_mimetype)
-
- # check for "soft 404" conditions, where we should retry with live SPNv2
- # TODO: could refactor these into the resource fetch things themselves?
- soft404 = False
- if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
- soft404 = True
-
- if self.try_spn2 and (not resource or not resource.hit or soft404):
- via = "spn2"
- force_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in url:
- force_get = 1
- break
- resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
- print("[FETCH {}\t] {}\t{}".format(
- via,
- resource.status,
- resource.terminal_url or url),
- file=sys.stderr)
- return resource
-
- def process_existing(self, request, result_row):
- """
- If we have an existing ingest file result, do any database fetches or
- additional processing necessary to return a result.
- """
- raise NotImplementedError("process_existing() not tested or safe yet")
- assert result_row['hit']
- existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
- if not (existing_file_meta and existing_grobid and existing_cdx):
- raise NotImplementedError("partially-exsiting records not implemented yet")
- result = {
- 'hit': result_row['hit'],
- 'status': "existing",
- 'request': request,
- 'grobid': existing_grobid,
- 'file_meta': existing_file_meta,
- 'cdx': existing_cdx,
- 'terminal': {
- 'terminal_url': result_row['terminal_url'],
- 'terminal_dt': result_row['terminal_dt'],
- 'terminal_status_code': result_row['terminal_status_code'],
- 'terminal_sha1hex': result_row['terminal_sha1hex'],
- },
- }
- return result
-
- def process_hit(self, resource, file_meta):
- """
- Run all the necessary processing for a new/fresh ingest hit.
- """
- return {
- 'grobid': self.process_grobid(resource, file_meta),
- }
-
- def process_grobid(self, resource, file_meta):
- """
- Submits the resource body to GROBID for processing.
-
- TODO: By default checks sandcrawler-db for an existing row first, then
- decide if we should re-process
- """
- if self.try_existing_grobid:
- existing = self.pgrest_client.get_grobid(file_meta['sha1hex'])
- if existing:
- print("found existing GROBID result", file=sys.stderr)
- return existing
-
- # Need to actually process
- result = self.grobid_client.process_fulltext(resource.body)
- if self.grobid_sink:
- # extra fields for GROBID kafka messages
- result['file_meta'] = file_meta
- result['key'] = result['file_meta']['sha1hex']
- self.grobid_sink.push_record(result.copy())
- if result['status'] == "success":
- metadata = self.grobid_client.metadata(result)
- if metadata:
- result['metadata'] = self.grobid_client.metadata(result)
- result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
- result['grobid_version'] = result['metadata'].pop('grobid_version', None)
- result.pop('tei_xml', None)
- result.pop('file_meta', None)
- result.pop('key', None)
- return result
-
- def timeout_response(self, task):
- print("[TIMEOUT]", file=sys.stderr)
- return dict(
- request=task,
- hit=False,
- status="timeout",
- error_message="ingest worker internal timeout",
- )
-
- def want(self, request):
- if not request.get('ingest_type') in ('file', 'pdf'):
- return False
- return True
-
- def process(self, request, key=None):
-
- # backwards compatibility
- if request.get('ingest_type') in ('file', None):
- request['ingest_type'] = 'pdf'
-
- # for now, only pdf ingest is implemented
- if not 'ingest_type' in request:
- request['ingest_type'] = "pdf"
- assert request.get('ingest_type') == "pdf"
- ingest_type = request.get('ingest_type')
-
- # parse/clean URL
- # note that we pass through the original/raw URL, and that is what gets
- # persisted in database table
- base_url = clean_url(request['base_url'])
-
- force_recrawl = bool(request.get('force_recrawl', False))
-
- for block in self.base_url_blocklist:
- if block in base_url:
- print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
- return dict(request=request, hit=False, status="skip-url-blocklist")
-
- print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
-
- best_mimetype = None
- if ingest_type == "pdf":
- best_mimetype = "application/pdf"
-
- existing = self.check_existing_ingest(base_url)
- if existing:
- return self.process_existing(request, existing)
-
- result = dict(request=request, hit=False)
-
- next_url = base_url
- hops = [base_url]
- self.max_hops = 6
-
-
- while len(hops) <= self.max_hops:
-
- result['hops'] = hops
- try:
- resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
- except SavePageNowError as e:
- result['status'] = 'spn2-error'
- result['error_message'] = str(e)[:1600]
- return result
- except PetaboxError as e:
- result['status'] = 'petabox-error'
- result['error_message'] = str(e)[:1600]
- return result
- except CdxApiError as e:
- result['status'] = 'cdx-error'
- result['error_message'] = str(e)[:1600]
- return result
- except WaybackError as e:
- result['status'] = 'wayback-error'
- result['error_message'] = str(e)[:1600]
- return result
- except NotImplementedError as e:
- result['status'] = 'not-implemented'
- result['error_message'] = str(e)[:1600]
- return result
-
- if not resource.hit:
- result['status'] = resource.status
- if resource.terminal_dt and resource.terminal_status_code:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
- if resource.terminal_url not in result['hops']:
- result['hops'].append(resource.terminal_url)
- return result
-
- if not resource.body:
- result['status'] = 'null-body'
- return result
- file_meta = gen_file_metadata(resource.body)
-
- # crude handling of content-encoding; wayback fetch library usually
- # (and should always?) handle this
- if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
- try:
- inner_body = gzip.decompress(resource.body)
- except Exception as e:
- result['status'] = 'bad-gzip-encoding'
- result['error_message'] = str(e)
- return result
- if not inner_body:
- result['status'] = 'null-body'
- return result
- resource = ResourceResult(
- body=inner_body,
- # copy all other fields
- start_url=resource.start_url,
- hit=resource.hit,
- status=resource.status,
- terminal_url=resource.terminal_url,
- terminal_dt=resource.terminal_dt,
- terminal_status_code=resource.terminal_status_code,
- cdx=resource.cdx,
- revisit_cdx=resource.revisit_cdx,
- )
- file_meta = gen_file_metadata(resource.body)
-
- if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
- # Got landing page or similar. Some XHTML detected as "application/xml"
- if resource.terminal_dt:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
- fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
- result['html'] = fulltext_url
- if not fulltext_url:
- result['status'] = 'no-pdf-link'
- return result
- next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
- assert next_url
- next_url = clean_url(next_url)
- print("[PARSE\t] {}\t{}".format(
- fulltext_url.get('technique'),
- next_url,
- ),
- file=sys.stderr)
- if next_url in hops:
- result['status'] = 'link-loop'
- result['error_message'] = "repeated: {}".format(next_url)
- return result
- hops.append(next_url)
- continue
-
- # default is to NOT keep hopping
- break
-
- if len(hops) >= self.max_hops:
- result['status'] = "max-hops-exceeded"
- return result
-
- if resource.terminal_dt:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- "terminal_sha1hex": file_meta['sha1hex'],
- }
-
- # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
- assert resource.hit == True
- assert resource.terminal_status_code in (200, 226)
-
- result['file_meta'] = file_meta
- result['cdx'] = cdx_to_dict(resource.cdx)
- if resource.revisit_cdx:
- result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
-
- # other failure cases
- if not resource.body or file_meta['size_bytes'] == 0:
- result['status'] = 'null-body'
- return result
-
- if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
-
- info = self.process_hit(resource, file_meta)
- result.update(info)
-
- result['status'] = "success"
- result['hit'] = True
- print("[SUCCESS\t] sha1:{} grobid:{}".format(
- result.get('file_meta', {}).get('sha1hex'),
- result.get('grobid', {}).get('status_code'),
- ),
- file=sys.stderr)
- return result
-
-
-class IngestFileRequestHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- if self.path != "/ingest":
- self.send_response(404)
- self.end_headers()
- self.wfile.write("404: Not Found")
- return
- length = int(self.headers.get('content-length'))
- request = json.loads(self.rfile.read(length).decode('utf-8'))
- print("Got request: {}".format(request))
- ingester = IngestFileWorker()
- result = ingester.process(request)
- self.send_response(200)
- self.end_headers()
- self.wfile.write(json.dumps(result))
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
new file mode 100644
index 0000000..03277f8
--- /dev/null
+++ b/python/sandcrawler/ingest_file.py
@@ -0,0 +1,925 @@
+import json
+import sys
+import time
+import xml.etree.ElementTree
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
+
+from selectolax.parser import HTMLParser
+
+from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import (
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiError,
+ NoCaptureError,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_html import (
+ WebResource,
+ fetch_html_resources,
+ html_extract_body_teixml,
+ html_guess_platform,
+ html_guess_scope,
+ quick_fetch_html_resources,
+)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.xml import xml_reserialize
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFileWorker(SandcrawlerWorker):
+ """
+ High level flow is to look in history first, then go to live web if
+ resource not found. Following redirects is treated as "fetching a
+ resource". Current version fetches a single resource; if it isn't a hit
+ but is an HTML 200, treats it as a landing page, tries to extract
+ fulltext link, then fetches that resource.
+
+ process(request, key=None) -> response
+ Does all the things!
+
+ Check existing processing (short circuit):
+
+ check_existing_ingest(base_url) -> ingest_file_result or none
+ process_existing(result) -> response
+ try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()
+
+ Fetch resource:
+
+ find_resource(url) -> ResourceResult
+
+ Process resource:
+
+ process_file_hit(ResourceResult) -> response
+ process_grobid(ResourceResult)
+ """
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__()
+
+ self.sink = sink
+
+ if kwargs.get("wayback_client"):
+ self.wayback_client: WaybackClient = kwargs["wayback_client"]
+ else:
+ self.wayback_client = WaybackClient()
+
+ if kwargs.get("spn_client"):
+ self.spn_client: SavePageNowClient = kwargs["spn_client"]
+ else:
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+
+ if kwargs.get("grobid_client"):
+ self.grobid_client: GrobidClient = kwargs["grobid_client"]
+ else:
+ self.grobid_client = GrobidClient()
+
+ if kwargs.get("pgrest_client"):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs["pgrest_client"]
+ else:
+ self.pgrest_client = SandcrawlerPostgrestClient()
+
+ self.grobid_sink = kwargs.get("grobid_sink")
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+ self.pdftext_sink = kwargs.get("pdftext_sink")
+ self.xmldoc_sink = kwargs.get("xmldoc_sink")
+ self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
+ self.max_hops = 8
+
+ self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
+ self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
+ self.try_existing_pdfextract = kwargs.get("try_existing_pdfextract", True)
+ self.try_wayback = kwargs.get("try_wayback", True)
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.html_quick_mode = kwargs.get("html_quick_mode", False)
+ self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
+
+ self.base_url_blocklist = [
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
+ "://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
+ # temporary, until we implement specific fetch and 'petabox' output
+ "://archive.org/",
+ "://www.archive.org/",
+ "://web.archive.org/web/",
+ # out of scope
+ "://openlibrary.org/",
+ "://www.openlibrary.org/",
+ "://fatcat.wiki/",
+ "://scholar.archive.org/",
+ "://orcid.org/",
+ # Domain squats
+ "://bartandjones.com",
+ "://ijretm.com",
+ "://ijrcemas.com",
+ "://jist.net.in",
+ "://croisements-revue.org",
+ # all stubs/previews, not full papers
+ "://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+ "://doi.org/10.25642/ipk/gbis/",
+ "://apex.ipk-gatersleben.de/",
+ "fao.org/glis/",
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
+ # DOI prefixes
+ "doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.18730/", # fao.org: database entry
+ "doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
+ # deprecated domain (doesn't redirect correctly)
+ "://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
+ ]
+
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
+ "/password-login",
+ "://gateway.isiknowledge.com/",
+ "/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
+ ]
+
+ self.cookie_blocklist = [
+ "/cookieAbsent",
+ "cookieSet=1",
+ "error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
+ ]
+
+ self.src_valid_mimetypes = [
+ "text/x-tex",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip",
+ "application/x-tar",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ]
+
+ self.component_valid_mimetypes = [
+ "image/jpeg",
+ "image/tiff",
+ "image/png",
+ "image/gif",
+ "audio/mpeg",
+ "video/mp4",
+ "video/mpeg",
+ "text/plain",
+ "text/csv",
+ "text/x-r-source", # dataverse
+ "text/tab-separated-values", # dataverse
+ "text/x-rst", # dataverse
+ "application/x-rlang-transport", # dataverse
+ "application/json",
+ "application/xml",
+ "application/pdf",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip ",
+ "application/x-rar ",
+ "application/x-7z-compressed",
+ "application/x-tar",
+ "application/vnd.ms-powerpoint",
+ "application/vnd.ms-excel",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ]
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Check in sandcrawler-db (postgres) to see if we have already ingested
+ this URL (ingest file result table).
+
+ Returns existing row *if* found *and* we should use it, otherwise None.
+
+ Looks at existing ingest results and makes a decision based on, eg,
+ status and timestamp.
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def find_resource(
+ self, url: str, best_mimetype: Optional[str] = None, force_recrawl: bool = False
+ ) -> Optional[ResourceResult]:
+ """
+ Looks in wayback for a resource starting at the URL, following any
+ redirects. If a hit isn't found, try crawling with SPN.
+ """
+ via = "none"
+ resource = None
+
+ if url.startswith("http://web.archive.org/web/") or url.startswith(
+ "https://web.archive.org/web/"
+ ):
+ raise NotImplementedError("handling direct wayback links not supported yet")
+
+ if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+ raise NotImplementedError("fetching from archive.org not implemented yet")
+
+ if self.try_wayback and not force_recrawl:
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(url, best_mimetype)
+
+ # check for "soft 404" conditions, where we should retry with live SPNv2
+ soft404 = False
+ # NOTE: these are often not working with SPNv2 either, so disabling. If
+ # we really want to try again, should do force-recrawl
+ # if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+ # soft404 = True
+
+ old_failure = False
+ if (
+ resource
+ and not resource.hit
+ and resource.terminal_dt
+ and resource.terminal_dt < "20190000000000"
+ ):
+ old_failure = True
+
+ if self.try_spn2 and (
+ resource is None
+ or (resource and resource.status == "no-capture")
+ or soft404
+ or old_failure
+ ):
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via, (resource and resource.status), (resource and resource.terminal_url) or url
+ ),
+ file=sys.stderr,
+ )
+ return resource
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest file result, do any database fetches or
+ additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+ assert result_row["hit"]
+ existing_file_meta = self.pgrest_client.get_file_meta(result_row["terminal_sha1hex"])
+ existing_grobid = self.pgrest_client.get_grobid(result_row["terminal_sha1hex"])
+ existing_cdx = self.pgrest_client.get_cdx(
+ result_row["terminal_url"], result_row["terminal_dt"]
+ )
+ if not (existing_file_meta and existing_grobid and existing_cdx):
+ raise NotImplementedError("partially-exsiting records not implemented yet")
+ result = {
+ "hit": result_row["hit"],
+ "status": "existing",
+ "request": request,
+ "grobid": existing_grobid,
+ "file_meta": existing_file_meta,
+ "cdx": existing_cdx,
+ "terminal": {
+ "terminal_url": result_row["terminal_url"],
+ "terminal_dt": result_row["terminal_dt"],
+ "terminal_status_code": result_row["terminal_status_code"],
+ "terminal_sha1hex": result_row["terminal_sha1hex"],
+ },
+ }
+ return result
+
+ def process_file_hit(
+ self, ingest_type: str, resource: ResourceResult, file_meta: dict
+ ) -> dict:
+ """
+ Run all the necessary processing for a new/fresh ingest hit.
+ """
+ if (
+ ingest_type in ["dataset-file", "component"]
+ and file_meta["mimetype"] == "application/pdf"
+ ):
+ ingest_type = "pdf"
+ if ingest_type == "pdf":
+ return {
+ "grobid": self.process_grobid(resource, file_meta),
+ "pdf_meta": self.process_pdfextract(resource, file_meta),
+ }
+ elif ingest_type == "xml":
+ return {
+ "xml_meta": self.process_xml(resource, file_meta),
+ }
+ elif ingest_type == "html":
+ html_info = self.process_html(resource, file_meta)
+ # if there is no html_biblio, don't clobber anything possibly extracted earlier
+ if "html_biblio" in html_info and not html_info["html_biblio"]:
+ html_info.pop("html_biblio")
+ return html_info
+ elif ingest_type == "src":
+ return {}
+ elif ingest_type == "component":
+ return {}
+ elif ingest_type == "dataset-file":
+ return {}
+ else:
+ raise NotImplementedError(f"process {ingest_type} hit")
+
+ def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Submits the resource body to GROBID for processing.
+
+ TODO: By default checks sandcrawler-db for an existing row first, then
+ decide if we should re-process
+ """
+ if self.try_existing_grobid:
+ existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
+ if existing:
+ # grobid_timestamp = existing.get("grobid_timestamp") or None
+ # status
+ grobid_version = existing.get("grobid_version") or None
+ if grobid_version and grobid_version.startswith("0.7"):
+ print("found existing GROBID result", file=sys.stderr)
+ return existing
+
+ # Need to actually process
+ result = self.grobid_client.process_fulltext(resource.body)
+ if self.grobid_sink:
+ # extra fields for GROBID kafka messages
+ result["file_meta"] = file_meta
+ result["key"] = result["file_meta"]["sha1hex"]
+ self.grobid_sink.push_record(result.copy())
+ if result["status"] == "success":
+ metadata = self.grobid_client.metadata(result)
+ if metadata:
+ result["metadata"] = metadata
+ result["fatcat_release"] = metadata.pop("fatcat_release", None)
+ result["grobid_version"] = metadata.pop("grobid_version", None)
+ result.pop("tei_xml", None)
+ result.pop("file_meta", None)
+ result.pop("key", None)
+ return result
+
+ def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Extracts thumbnail and pdf_meta info from PDF.
+
+ By default checks sandcrawler-db for an existing row first, then decide
+ if we should re-process.
+
+ TODO: difference between Kafka schema and SQL/postgrest schema
+ """
+ if self.try_existing_pdfextract:
+ existing = self.pgrest_client.get_pdf_meta(file_meta["sha1hex"])
+ if existing:
+ print("found existing pdf_meta result", file=sys.stderr)
+ result = PdfExtractResult.from_pdf_meta_dict(existing)
+ return result.to_pdftext_dict()
+
+ # Need to actually process
+ result = process_pdf(resource.body)
+ assert result.sha1hex == file_meta["sha1hex"]
+ assert result.file_meta is not None
+ assert result.file_meta["sha1hex"] == file_meta["sha1hex"]
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+ if self.pdftext_sink:
+ self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
+ result.page0_thumbnail = None
+ result.text = None
+ result.file_meta = None
+ return result.to_pdftext_dict()
+
+ def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Simply publishes to Kafka topic.
+
+ In the future, could extract other metadata here (like body word
+ count), or attempt to fetch sub-resources.
+ """
+ if self.xmldoc_sink and file_meta["mimetype"] == "application/jats+xml":
+ try:
+ jats_xml = xml_reserialize(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="xml-parse-error")
+ msg = dict(
+ sha1hex=file_meta["sha1hex"],
+ status="success",
+ jats_xml=jats_xml,
+ )
+ self.xmldoc_sink.push_record(msg, key=file_meta["sha1hex"])
+ return dict(status="success")
+
+ def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+ assert resource.body
+ try:
+ html_doc = HTMLParser(resource.body)
+ except ValueError:
+ return dict(status="html-selectolax-error")
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
+ html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
+ html_scope = html_guess_scope(
+ resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
+ )
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
+
+ if html_scope in ("blocked-captcha", "blocked-cookie", "blocked-forbidden"):
+ return dict(
+ status=html_scope,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ )
+ elif html_scope not in (
+ "article-fulltext",
+ "unknown",
+ ):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="wrong-scope",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ raw_resources = html_extract_resources(
+ resource.terminal_url, html_doc, self.adblock_rules
+ )
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ if self.htmlteixml_sink and html_body["status"] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta["sha1hex"])
+
+ html_body.pop("tei_xml", None)
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ when = parse_cdx_datetime(resource.cdx.datetime)
+ full_resources: List[WebResource] = []
+
+ try:
+ if self.html_quick_mode:
+ print(" WARN: running quick CDX-only fetches", file=sys.stderr)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, self.wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result["status"] = "petabox-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result["status"] = "cdx-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result["status"] = "wayback-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result["status"] = "wayback-content-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except NoCaptureError as e:
+ partial_result["status"] = "html-resource-no-capture"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+
+ info = dict(
+ html_body=html_body,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+ )
+ if html_scope == "unknown":
+ info["status"] = "unknown-scope"
+ return info
+
+ def timeout_response(self, task: dict) -> dict:
+ print("[TIMEOUT]", file=sys.stderr)
+ return dict(
+ request=task,
+ hit=False,
+ status="timeout",
+ error_message="ingest worker internal timeout",
+ )
+
+ def want(self, request: dict) -> bool:
+ if not request.get("ingest_type") in ("file", "pdf", "xml", "html", "src", "component"):
+ return False
+ return True
+
+ def process(self, request: dict, key: Any = None) -> dict:
+ return self.process_file(request, key=key)
+
+ def process_file(self, request: dict, key: Any = None) -> dict:
+
+ # old backwards compatibility
+ if request.get("ingest_type") == "file":
+ request["ingest_type"] = "pdf"
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ best_mimetype = None
+ if ingest_type == "pdf":
+ best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
+
+ existing = self.check_existing_ingest(ingest_type, base_url)
+ if existing:
+ return self.process_existing(request, existing)
+
+ result: Dict[str, Any] = dict(request=request, hit=False)
+
+ next_url = base_url
+ hops = [base_url]
+
+ while len(hops) <= self.max_hops:
+
+ result["hops"] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ # also check against known loginwall patterns
+ for block in self.wall_blocklist:
+ if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
+ result["status"] = "skip-wall"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ try:
+ resource = self.find_resource(
+ next_url, best_mimetype, force_recrawl=force_recrawl
+ )
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ assert resource
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ if ingest_type == "pdf" and html_ish_resource:
+
+ # the new style of URL extraction (already computed)
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+
+ result["extract_next_hop"] = fulltext_url
+ if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
+ result["status"] = "no-pdf-link"
+ return result
+ next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
+ assert next_url
+ next_url = clean_url(next_url)
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ fulltext_url.get("technique"),
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif (
+ ingest_type in ("xml", "html", "component")
+ and html_ish_resource
+ and html_biblio
+ ):
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ next_url = next_url_found
+ technique = "html_biblio"
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result["status"] = "max-hops-exceeded"
+ return result
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "src":
+ if file_meta["mimetype"] not in self.src_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta["mimetype"] not in self.component_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
+
+ info = self.process_file_hit(ingest_type, resource, file_meta)
+ result.update(info)
+
+ # check if processing turned up an error
+ if info.get("status") not in ("success", None):
+ result["status"] = info["status"]
+ return result
+
+ result["status"] = "success"
+ result["hit"] = True
+ if ingest_type == "pdf":
+ print(
+ "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ result.get("grobid", {}).get("status_code"),
+ result.get("pdf_meta", {}).get("status"),
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ ),
+ file=sys.stderr,
+ )
+ return result
+
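
For orientation, a successful "pdf" result assembled by process_file() above has roughly the following shape; all values here are placeholders, and the grobid/pdf_meta/cdx sub-dicts are abbreviated:

    # illustrative result shape only; real payloads carry full GROBID metadata,
    # pdf_meta, and CDX rows
    example_result = {
        "request": {"ingest_type": "pdf", "base_url": "https://example.com/article/123"},
        "hit": True,
        "status": "success",
        "hops": [
            "https://example.com/article/123",
            "https://example.com/article/123.pdf",
        ],
        "terminal": {
            "terminal_url": "https://example.com/article/123.pdf",
            "terminal_dt": "20211012060000",
            "terminal_status_code": 200,
            "terminal_sha1hex": "0000000000000000000000000000000000000000",
        },
        "file_meta": {
            "sha1hex": "0000000000000000000000000000000000000000",
            "mimetype": "application/pdf",
            "size_bytes": 123456,
        },
        "cdx": {},       # cdx_to_dict(resource.cdx)
        "grobid": {},    # from process_grobid()
        "pdf_meta": {},  # from process_pdfextract()
    }
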
+
+class IngestFileRequestHandler(BaseHTTPRequestHandler):
+ def do_POST(self) -> None:
+ if self.path != "/ingest":
+ self.send_response(404)
+ self.end_headers()
+ self.wfile.write(b"404: Not Found")
+ return
+ length = int(self.headers.get("content-length"))
+ request = json.loads(self.rfile.read(length).decode("utf-8"))
+ print("Got request: {}".format(request))
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(json.dumps(result).encode("utf8"))
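
A minimal sketch of serving this handler (the host, port, and standalone __main__ guard are assumptions, not part of this change):

    from http.server import HTTPServer

    if __name__ == "__main__":
        # POST a JSON ingest request to http://localhost:8083/ingest
        server = HTTPServer(("localhost", 8083), IngestFileRequestHandler)
        server.serve_forever()
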
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..3acbece
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,516 @@
+import json
+import sys
+import time
+from typing import Any, Dict, Optional
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.fileset_platforms import (
+ ArchiveOrgHelper,
+ DataverseHelper,
+ FigshareHelper,
+ ZenodoHelper,
+)
+from sandcrawler.fileset_strategies import (
+ ArchiveorgFilesetStrategy,
+ ArchiveorgFileStrategy,
+ WebFilesetStrategy,
+ WebFileStrategy,
+)
+from sandcrawler.fileset_types import (
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (
+ CdxApiError,
+ PetaboxError,
+ SavePageNowError,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_file import IngestFileWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.workers import SandcrawlerWorker
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFilesetWorker(IngestFileWorker):
+ """
+ General process is:
+
+ 1. crawl base_url, and use request and landing page resource (eg, HTML) to
+ determine platform being targeted
+ 2. use platform-specific helper to fetch metadata about the work, including
+ a manifest of files, and selection of an "ingest strategy" and any
+ required context
+ 3. then use strategy-specific helper to archive files from manifest (first
+ checking to see if content has been archived already)
+ 4. summarize status
+ """
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__(sink=None, **kwargs)
+
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.sink = sink
+ self.dataset_platform_helpers = {
+ "dataverse": DataverseHelper(),
+ "figshare": FigshareHelper(),
+ "zenodo": ZenodoHelper(),
+ "archiveorg": ArchiveOrgHelper(),
+ }
+ self.dataset_strategy_archivers = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+ IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+ IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
+ }
+
+ self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)
+ self.max_file_count = kwargs.get("max_file_count", 200)
+ self.ingest_file_result_sink = kwargs.get("ingest_file_result_sink")
+ self.ingest_file_result_stdout = kwargs.get("ingest_file_result_stdout", False)
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Same as file version, but uses fileset result table
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_fileset_platform(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest fileset result, do any database fetches
+ or additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+
+ def want(self, request: dict) -> bool:
+ if not request.get("ingest_type") in ("dataset",):
+ return False
+ return True
+
+ def fetch_resource_iteratively(
+ self, ingest_type: str, base_url: str, force_recrawl: bool
+ ) -> dict:
+ """
+ This is copypasta from process_file(), should probably refactor.
+ """
+
+ result: Dict[str, Any] = dict(hit=False)
+ result["hops"] = [base_url]
+ next_url = base_url
+
+ # check against blocklist
+ for block in self.base_url_blocklist:
+ # NOTE: hack to not skip archive.org content
+ if "archive.org" in block:
+ continue
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ try:
+ resource = self.find_resource(next_url, force_recrawl=force_recrawl)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ html_biblio = None
+ if resource:
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ # eg, datasets, components, etc
+ pass
+
+ result["_html_biblio"] = html_biblio
+ result["_resource"] = resource
+ return result
+
+ def process(self, request: dict, key: Any = None) -> dict:
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("dataset",):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ # TODO: "existing" check against file and/or fileset ingest result table
+ # existing = self.check_existing_ingest(ingest_type, base_url)
+ # if existing:
+ # return self.process_existing(request, existing)
+
+ result = self.fetch_resource_iteratively(
+ ingest_type, base_url, force_recrawl=force_recrawl
+ )
+ result["request"] = request
+ if result.get("status") is not None:
+ result["request"] = request
+ return result
+
+ html_biblio = result.pop("_html_biblio")
+ resource = result.pop("_resource")
+
+ # 1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ # TODO: could involve html_guess_platform() here?
+
+ # determine platform
+ platform_helper = None
+ for (helper_name, helper) in self.dataset_platform_helpers.items():
+ if helper.match_request(request, resource, html_biblio):
+ platform_helper = helper
+ break
+
+ if not platform_helper:
+ result["status"] = "no-platform-match"
+ return result
+
+ # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+ try:
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ except PlatformScopeError as e:
+ result["status"] = "platform-scope"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PlatformRestrictedError as e:
+ result["status"] = "platform-restricted"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except requests.exceptions.HTTPError as e:
+ result["error_message"] = str(e)[:1600]
+ if e.response.status_code == 404:
+ result["status"] = "platform-404"
+ result["error_message"] = str(e)[:1600]
+ return result
+ else:
+ result["status"] = "platform-http-error"
+ return result
+ except requests.exceptions.RequestException as e:
+ result["error_message"] = str(e)[:1600]
+ result["status"] = "platform-error"
+ return result
+
+ # print(dataset_meta, file=sys.stderr)
+ platform = dataset_meta.platform_name
+ result["platform_name"] = dataset_meta.platform_name
+ result["platform_domain"] = dataset_meta.platform_domain
+ result["platform_id"] = dataset_meta.platform_id
+ result["platform_base_url"] = dataset_meta.web_base_url
+ result["archiveorg_item_name"] = dataset_meta.archiveorg_item_name
+
+ if not dataset_meta.manifest:
+ result["status"] = "empty-manifest"
+ return result
+
+ # these will get confirmed/updated after ingest
+ result["manifest"] = [m.dict(exclude_none=True) for m in dataset_meta.manifest]
+ result["file_count"] = len(dataset_meta.manifest)
+ result["total_size"] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+ if result["total_size"] > self.max_total_size:
+ result["status"] = "too-large-size"
+ return result
+ if result["file_count"] > self.max_file_count:
+ # hard max, to prevent downstream breakage
+ if result["file_count"] > 10 * 1000:
+ result["manifest"] = result["manifest"][: self.max_file_count]
+ result["status"] = "too-many-files"
+ return result
+
+ ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+ result["ingest_strategy"] = ingest_strategy
+ print(
+ f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}",
+ file=sys.stderr,
+ )
+
+ strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+ if not strategy_helper:
+ result["status"] = "no-strategy-helper"
+ return result
+
+ # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ # 4. Summarize status and return structured result metadata.
+ result["status"] = archive_result.status
+ result["manifest"] = [m.dict(exclude_none=True) for m in archive_result.manifest]
+
+ if ingest_strategy.endswith("-fileset-bundle"):
+ result["fileset_bundle"] = dict()
+ if archive_result.bundle_file_meta:
+ result["fileset_bundle"]["file_meta"] = archive_result.bundle_file_meta
+ if archive_result.bundle_archiveorg_path:
+ result["fileset_bundle"][
+ "archiveorg_bundle_path"
+ ] = archive_result.bundle_archiveorg_path
+ if archive_result.bundle_resource:
+ result["fileset_bundle"]["terminal"] = dict(
+ terminal_url=archive_result.bundle_resource.terminal_url,
+ terminal_dt=archive_result.bundle_resource.terminal_dt,
+ terminal_status_code=archive_result.bundle_resource.terminal_status_code,
+ )
+ if archive_result.bundle_resource.cdx:
+ result["fileset_bundle"]["cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.cdx
+ )
+ if archive_result.bundle_resource.revisit_cdx:
+ result["fileset_bundle"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.revisit_cdx
+ )
+
+ if ingest_strategy.endswith("-file"):
+ result["fileset_file"] = dict()
+ if archive_result.file_file_meta:
+ result["fileset_file"]["file_meta"] = (archive_result.file_file_meta,)
+ if archive_result.file_resource:
+ result["fileset_file"]["terminal"] = dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ )
+ if archive_result.file_resource.cdx:
+ result["fileset_file"]["cdx"] = cdx_to_dict(
+ archive_result.file_resource.cdx
+ )
+ if archive_result.file_resource.revisit_cdx:
+ result["fileset_file"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+
+ if result["status"].startswith("success"):
+ # check that these are still valid
+ assert result["file_count"] == len(archive_result.manifest)
+ assert result["total_size"] == sum(
+ [m.size for m in archive_result.manifest if m.size]
+ )
+
+ if (
+ result["status"] == "success-file"
+ and archive_result.file_resource
+ and archive_result.file_file_meta
+ ):
+ file_result: Dict[str, Any] = dict(
+ hit=True,
+ status="success",
+ request=request.copy(),
+ file_meta=archive_result.file_file_meta,
+ terminal=dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ terminal_sha1hex=archive_result.file_file_meta["sha1hex"],
+ ),
+ )
+ if archive_result.file_resource.cdx:
+ file_result["cdx"] = cdx_to_dict(archive_result.file_resource.cdx)
+ if archive_result.file_resource.revisit_cdx:
+ file_result["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+ file_result["request"]["ingest_type"] = request["ingest_type"] + "-file"
+ # call the super() (ingest_file) version of process_hit()
+ info = self.process_file_hit(
+ file_result["request"]["ingest_type"],
+ archive_result.file_resource,
+ archive_result.file_file_meta,
+ )
+ file_result.update(info)
+ if self.ingest_file_result_sink:
+ self.ingest_file_result_sink.push_record(result.copy())
+ elif self.ingest_file_result_stdout:
+ sys.stdout.write(json.dumps(file_result, sort_keys=True) + "\n")
+
+ if result["status"].startswith("success"):
+ result["hit"] = True
+ print(
+ "[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["status"],
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ return result
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
new file mode 100644
index 0000000..fb42e71
--- /dev/null
+++ b/python/sandcrawler/ingest_html.py
@@ -0,0 +1,499 @@
+import argparse
+import datetime
+import json
+import sys
+import xml.etree.ElementTree as ET
+from typing import Any, List, Optional, Tuple
+
+import pydantic
+import trafilatura
+from selectolax.parser import HTMLParser
+
+from sandcrawler.html_metadata import (
+ BiblioMetadata,
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiClient,
+ NoCaptureError,
+ WaybackClient,
+ WaybackContentError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+ datetime_to_cdx,
+ gen_file_metadata,
+ parse_cdx_datetime,
+ url_fuzzy_equal,
+)
+
+TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
+
+def html_extract_body_teixml(doc: bytes) -> dict:
+ try:
+ tei_xml = trafilatura.extract(
+ doc,
+ output_format="xmltei",
+ include_comments=False,
+ include_formatting=True,
+ )
+ except (ValueError, TypeError, Exception) as e:
+ return dict(
+ status="trafilatura-parse-error",
+ error_msg=str(e)[:1000],
+ )
+ if tei_xml:
+ body_txt = teixml_body_text(tei_xml)
+ word_count = len(body_txt.split())
+ return dict(
+ status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+ )
+ elif doc.startswith(
+ b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
+ ):
+ # hack for firstmonday.org
+ return html_extract_body_teixml(doc[106:])
+ else:
+ return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
+
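
A small usage sketch of the extractor above (the HTML snippet is a trivial stand-in, not a real capture):

    html = b"<html><body><p>Some article text goes here.</p></body></html>"
    body = html_extract_body_teixml(html)
    if body["status"] == "success":
        word_count = body["word_count"]
        tei_xml = body["tei_xml"]  # TEI document string, fed to the htmlteixml sink
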
+
+def teixml_body_text(doc_xml: str) -> str:
+ ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+ tree = ET.fromstring(doc_xml)
+ body = tree.find(".//tei:body", ns)
+ if body:
+ return " ".join(body.itertext())
+ else:
+ return ""
+
+
+class WebResource(pydantic.BaseModel):
+ surt: str
+ timestamp: datetime.datetime
+ url: str
+ sha1hex: str
+ mimetype: str
+ status_code: int
+ size: Optional[int]
+ sha256hex: Optional[str]
+ resource_type: Optional[str]
+
+ class Config:
+ json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}
+
+
+class IngestWebResult(pydantic.BaseModel):
+ status: str
+ hit: bool
+ error_message: Optional[str]
+ cdx: Optional[dict]
+ terminal: Optional[Any] # TODO
+ request: Optional[Any] # TODO
+ file_meta: Optional[dict]
+ html_biblio: Optional[BiblioMetadata]
+ scope: Optional[str]
+ html_body: Optional[dict]
+ html_resources: Optional[List[WebResource]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
+
+class HtmlMetaRow(pydantic.BaseModel):
+ sha1hex: str
+ status: str
+ scope: Optional[str]
+ has_teixml: bool
+ has_thumbnail: bool
+ word_count: Optional[int]
+ biblio: Optional[dict]
+ resources: Optional[List[dict]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
+ def to_sql_tuple(self) -> Tuple:
+ """
+ This is for the html_meta SQL table.
+ """
+ return (
+ self.sha1hex,
+ datetime.datetime.now(), # updated
+ self.status,
+ self.scope,
+ self.has_teixml,
+ self.has_thumbnail,
+ self.word_count,
+ (self.biblio or None) and json.dumps(self.biblio, sort_keys=True),
+ (self.resources or None) and json.dumps(self.resources, sort_keys=True),
+ )
+
+
+def quick_fetch_html_resources(
+ resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
+ """
+ This is the lazy version that just does a CDX lookup for each resource.
+
+ Takes a list instead of single record because we may want to circuit break
+ on failure, and may introduce concurrency internal to this function.
+ """
+
+ full = []
+ closest = when and datetime_to_cdx(when)
+ for resource in resources:
+ cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
+ if not cdx_row:
+ raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
+ if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+ print(
+ f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+ )
+ if not cdx_row.status_code:
+ # TODO: fall back to a full fetch?
+ print(" WARN: skipping revisit record", file=sys.stderr)
+ continue
+ full.append(
+ WebResource(
+ surt=cdx_row.surt,
+ timestamp=cdx_row.datetime,
+ url=cdx_row.url,
+ sha1hex=cdx_row.sha1hex,
+ mimetype=cdx_row.mimetype,
+ status_code=cdx_row.status_code,
+ size=None,
+ sha256hex=None,
+ resource_type=resource["type"],
+ )
+ )
+
+ return full
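+
+ # Usage sketch (added commentary, mirroring run_single() further below; the
+ # variable names are illustrative):
+ #
+ #   raw_resources = html_extract_resources(terminal_url, html_doc, adblock)
+ #   when = parse_cdx_datetime(cdx_row.datetime)
+ #   web_resources = quick_fetch_html_resources(raw_resources, cdx_client, when)
+ #
+ # NoCaptureError is raised if any sub-resource is missing from the CDX index.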
+
+
+def fetch_html_resources(
+ resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
+ """
+ This is the full version which fetches each resource from wayback/petabox
+ and calculates additional hashes.
+
+ Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
+ """
+
+ full = []
+ closest = when and datetime_to_cdx(when)
+ for resource in resources:
+ wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+ if not wayback_resp or wayback_resp.status != "success":
+ raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
+ # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+ # either the transfer-encoded or the inner (un-encoded) payload body.
+ # This is because of an ambiguity in the WARC specification
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
+ raise WaybackContentError(
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
+ )
+ full.append(
+ WebResource(
+ surt=wayback_resp.cdx.surt,
+ timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
+ url=wayback_resp.cdx.url,
+ sha1hex=file_meta["sha1hex"],
+ mimetype=file_meta["mimetype"],
+ status_code=wayback_resp.cdx.status_code
+ or wayback_resp.revisit_cdx.status_code,
+ size=file_meta["size_bytes"],
+ sha256hex=file_meta["sha256hex"],
+ resource_type=resource["type"],
+ )
+ )
+
+ return full
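+
+ # Added note: like the quick variant above, this raises NoCaptureError (and
+ # WaybackContentError on hash or encoding problems) rather than returning a
+ # partial list; callers decide how to map those exceptions to ingest statuses.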
+
+
+def html_guess_platform(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
+
+ generator: Optional[str] = None
+ generator_elem = doc.css_first("meta[name='generator']")
+ if generator_elem:
+ generator = generator_elem.attrs["content"]
+ else:
+ generator_elem = doc.css_first("a[id='developedBy']")
+ if generator_elem:
+ generator = generator_elem.text()
+ if generator and "open journal systems 3" in generator.lower():
+ return "ojs3"
+ elif generator and "open journal systems" in generator.lower():
+ return "ojs"
+ elif generator and "plone" in generator.lower():
+ return "plone"
+ elif generator and "wordpress" in generator.lower():
+ return "wordpress"
+ elif generator and "blogger" in generator.lower():
+ return "blogger"
+ elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
+ return "ojs"
+ else:
+ try:
+ if (
+ 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+ in doc.html
+ ):
+ return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
+ if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
+ return "arpha"
+ if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
+ return "galenos"
+ except UnicodeDecodeError:
+ pass
+
+ icon_elem = doc.css_first("link[type='image/x-icon']")
+ if icon_elem and "href" in icon_elem.attrs:
+ if "journalssystem.com" in icon_elem.attrs["href"]:
+ return "journalssystem.com"
+ elif "indexcopernicus.com" in icon_elem.attrs["href"]:
+ return "indexcopernicus"
+
+ if "scielo" in url:
+ return "scielo"
+
+ return None
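+
+ # Added note: the platform label returned above feeds the platform-specific
+ # branches in html_guess_scope() below (eg, OJS landing page vs fulltext
+ # heuristics).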
+
+
+def html_guess_scope(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
+ """
+ This function tries to guess if an HTML document represents one of:
+
+ - article-fulltext
+ - article-abstract
+ - article-sample
+ - supplement
+ - component
+ - issue-fulltext
+ - landingpage
+ - homepage-domain
+ - blocked-paywall
+ - blocked-login
+ - blocked-captcha
+ - blocked-cookie
+ - errorpage
+ - stub
+ - other
+ - unknown
+
+ Unknown implies the page could be anything. "other" implies it is not
+ fulltext or a landing page, but could be one of the other categories.
+ """
+
+ # assert that this is a real URL
+ assert url.count("/") >= 2
+
+ # basic paywall and loginwall detection based on URL
+ if url.endswith("/cookieAbsent"):
+ return "blocked-cookie"
+ if "://page-one.live.cf.public.springer.com" in url:
+ return "article-sample"
+
+ if "scielo" in url:
+ if "sci_abstract" in url:
+ return "landingpage"
+ if "sci_arttext" in url:
+ return "article-fulltext"
+
+ if "showcaptcha.asp" in url:
+ return "blocked-captcha"
+
+ # is this the top-level URL of the domain? aka, no path?
+ if url.count("/") <= 2 or (url.count("/") == 3) and url.endswith("/"):
+ return "homepage-domain"
+
+ platform = html_guess_platform(url, doc, biblio)
+
+ if biblio:
+ if biblio.html_fulltext_url:
+ if url_fuzzy_equal(biblio.html_fulltext_url, url):
+ return "article-fulltext"
+ else:
+ return "landingpage"
+
+ # platform-specific detection
+ if platform in ("ojs", "ojs3"):
+
+ if biblio and biblio.title:
+ if word_count and word_count > 1200:
+ return "fulltext"
+ else:
+ return "landingpage"
+ else:
+ if "/article/view/" in url and word_count and word_count > 600:
+ return "fulltext"
+ return "other"
+ elif platform == "journalssystem.com":
+ if biblio and biblio.pdf_fulltext_url and word_count and word_count < 1000:
+ return "landingpage"
+
+ # more platform/publisher specific checks
+ if "karger.com/Article/Abstract" in url:
+ return "landingpage"
+ if "dergipark.gov.tr" in url and not ("download/article-file" in url):
+ return "other"
+
+ try:
+ if isinstance(doc.html, str) and "<center><h1>403 Forbidden</h1></center>" in doc.html:
+ # cloudflare block pattern
+ return "blocked-forbidden"
+ except UnicodeDecodeError:
+ pass
+
+ print(f" scope guessing: platform {platform} word count: {word_count}", file=sys.stderr)
+
+ # fallback: guess based on word count (arbitrary guesses here)
+ if word_count is not None:
+ if word_count < 20:
+ return "stub"
+ elif word_count > 500 and platform in ["wordpress", "blogger"]:
+ return "article-fulltext"
+ elif word_count > 1200:
+ return "article-fulltext"
+
+ return "unknown"
+
+
+def run_single(
+ url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
+
+ adblock = load_adblock_rules()
+ wayback_client = WaybackClient()
+
+ html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
+ if html_resource.status != "success":
+ return IngestWebResult(
+ status=html_resource.status,
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ )
+
+ assert html_resource.terminal_status_code == 200
+
+ file_meta = gen_file_metadata(html_resource.body)
+ file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
+
+ if file_meta["mimetype"] not in ("text/html", "text/xml"):
+ return IngestWebResult(
+ status="wrong-mimetype",
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ )
+
+ html_doc = HTMLParser(html_resource.body)
+ html_biblio = html_extract_biblio(url, html_doc)
+ html_body = html_extract_body_teixml(html_resource.body)
+ html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+ if html_scope not in ("article-fulltext", "unknown"):
+ return IngestWebResult(
+ status="wrong-scope",
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ html_biblio=html_biblio,
+ scope=html_scope,
+ )
+
+ raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
+ assert len(raw_resources) <= 200
+
+ when = parse_cdx_datetime(html_resource.cdx.datetime)
+
+ full_resources: List[WebResource] = []
+ if quick_mode:
+ full_resources = quick_fetch_html_resources(
+ raw_resources, wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, wayback_client, when)
+
+ output = IngestWebResult(
+ status="success",
+ hit=True,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ html_body=html_body,
+ html_biblio=html_biblio,
+ scope=html_scope,
+ html_resources=full_resources,
+ )
+ return output
+
+
+def main() -> None:
+ """
+ Run this command like:
+
+ python -m sandcrawler.ingest_html
+ """
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ subparsers = parser.add_subparsers()
+
+ sub = subparsers.add_parser(
+ "single", help="tries to ingest a single URL, dumps result to stdout"
+ )
+ sub.set_defaults(func="run_single")
+ sub.add_argument(
+ "url",
+ help="URL to fetch",
+ type=str,
+ )
+ sub.add_argument(
+ "--timestamp",
+ help="timestamp for which to fetch document from wayback",
+ type=str,
+ )
+ sub.add_argument(
+ "--quick-mode",
+ help="don't fetch resources, only do CDX lookup",
+ action="store_true",
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ if args.func == "run_single":
+ result = run_single(args.url, args.timestamp, args.quick_mode)
+ print(result.json(indent=2, exclude_none=True))
+ else:
+ # func = getattr(wp, args.func)
+ # func()
+ raise NotImplementedError()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 8b02211..8836515 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,14 +1,18 @@
-
-import io
-import os
import hashlib
+import io
+from typing import Optional, Tuple, Union
import minio
class SandcrawlerMinioClient(object):
-
- def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None,
+ ):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -17,8 +21,8 @@ class SandcrawlerMinioClient(object):
Example config:
host="localhost:9000",
- access_key=os.environ['MINIO_ACCESS_KEY'],
- secret_key=os.environ['MINIO_SECRET_KEY'],
+ access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+ secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
"""
self.mc = minio.Minio(
host_url,
@@ -28,7 +32,7 @@ class SandcrawlerMinioClient(object):
)
self.default_bucket = default_bucket
- def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
if not extension:
extension = ""
if not prefix:
@@ -44,7 +48,15 @@ class SandcrawlerMinioClient(object):
)
return obj_path
- def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def put_blob(
+ self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> Tuple[str, str]:
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -53,7 +65,7 @@ class SandcrawlerMinioClient(object):
filename is SHA1 with an optional file extension.
"""
if type(blob) == str:
- blob = blob.encode('utf-8')
+ blob = blob.encode("utf-8")
assert type(blob) == bytes
if not sha1hex:
h = hashlib.sha1()
@@ -64,13 +76,13 @@ class SandcrawlerMinioClient(object):
bucket = self.default_bucket
assert bucket
content_type = "application/octet-stream"
- if extension.endswith('.xml'):
+ if extension.endswith(".xml"):
content_type = "application/xml"
- if extension.endswith('.png'):
+ if extension.endswith(".png"):
content_type = "image/png"
- elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ elif extension.endswith(".jpg") or extension.endswith(".jpeg"):
content_type = "image/jpeg"
- elif extension.endswith('.txt'):
+ elif extension.endswith(".txt"):
content_type = "text/plain"
self.mc.put_object(
bucket,
@@ -81,7 +93,14 @@ class SandcrawlerMinioClient(object):
)
return (bucket, obj_path)
- def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ def get_blob(
+ self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> bytes:
"""
sha1hex is sha1 of the blob itself
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 1b8aa92..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,29 +1,70 @@
-
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
+from typing import List, Optional
+
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-def clean_url(s):
+def clean_url(s: str) -> str:
s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
- parsed.colon_before_port = b''
+ parsed.colon_before_port = b""
return str(urlcanon.whatwg(parsed))
-def gen_file_metadata(blob):
+
+def url_fuzzy_equal(left: str, right: str) -> bool:
+ """
+ TODO: use proper surt library and canonicalization for this check
+ """
+ fuzzy_left = "://".join(
+ clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ fuzzy_right = "://".join(
+ clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ if fuzzy_left == fuzzy_right:
+ return True
+ elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
+ return True
+ return False
+
+
+def test_url_fuzzy_equal() -> None:
+ assert (
+ url_fuzzy_equal(
+ "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ )
+ is True
+ )
+
+
+def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
"""
- assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ assert blob is not None
+ if not allow_empty:
+ assert blob
+ if len(blob) < 1024 * 1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
+ if mimetype in ("application/xml", "text/xml"):
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
hashlib.sha256(),
@@ -39,7 +80,50 @@ def gen_file_metadata(blob):
mimetype=mimetype,
)
-def b32_hex(s):
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+ """
+ Variant of gen_file_metadata() which works with files on local disk
+ """
+ assert path is not None
+ mimetype = magic.Magic(mime=True).from_file(path)
+ if mimetype in ("application/xml", "text/xml"):
+ with open(path, "rb") as f:
+ blob = f.read(1024)
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if (
+ b"<htm" in blob[:1024]
+ and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]
+ ):
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
+ hashes = [
+ hashlib.sha1(),
+ hashlib.sha256(),
+ hashlib.md5(),
+ ]
+ size_bytes = 0
+ with open(path, "rb") as f:
+ while True:
+ chunk = f.read(1024 * 1024)
+ if not chunk:
+ break
+ size_bytes += len(chunk)
+ for h in hashes:
+ h.update(chunk)
+ if not allow_empty:
+ assert size_bytes > 0
+ return dict(
+ size_bytes=size_bytes,
+ sha1hex=hashes[0].hexdigest(),
+ sha256hex=hashes[1].hexdigest(),
+ md5hex=hashes[2].hexdigest(),
+ mimetype=mimetype,
+ )
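+
+ # Illustrative example (hypothetical path and values):
+ #
+ #   gen_file_metadata_path("/tmp/paper.pdf")
+ #   # => {"size_bytes": 123456, "sha1hex": "...", "sha256hex": "...",
+ #   #     "md5hex": "...", "mimetype": "application/pdf"}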
+
+
+def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -52,45 +136,45 @@ def b32_hex(s):
if len(s) == 40:
return s
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
NORMAL_MIME = (
- 'application/pdf',
- 'application/postscript',
- 'text/html',
- 'text/xml',
- 'application/octet-stream',
+ "application/pdf",
+ "application/postscript",
+ "text/html",
+ "text/xml",
+ "application/octet-stream",
)
-def normalize_mime(raw):
+
+def normalize_mime(raw: str) -> Optional[str]:
raw = raw.lower().strip()
for norm in NORMAL_MIME:
if raw.startswith(norm):
return norm
# Special cases
- if raw.startswith('application/xml'):
- return 'text/xml'
- if raw.startswith('application/x-pdf'):
- return 'application/pdf'
- if raw in (
- '.pdf',
- ):
- return 'application/pdf'
+ if raw.startswith("application/xml"):
+ return "text/xml"
+ if raw.startswith("application/x-pdf"):
+ return "application/pdf"
+ if raw in (".pdf",):
+ return "application/pdf"
if raw in (
- 'application/download',
- 'binary/octet-stream',
- 'unk',
- 'application/x-download',
- 'application/octetstream',
- 'application/force-download',
- 'application/unknown',
- ):
- return 'application/octet-stream'
+ "application/download",
+ "binary/octet-stream",
+ "unk",
+ "application/x-download",
+ "application/octetstream",
+ "application/force-download",
+ "application/unknown",
+ ):
+ return "application/octet-stream"
return None
-def test_normalize_mime():
+def test_normalize_mime() -> None:
assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -103,7 +187,7 @@ def test_normalize_mime():
assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx, normalize=True):
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -124,46 +208,81 @@ def parse_cdx_line(raw_cdx, normalize=True):
offset = cdx[9]
warc = cdx[10]
- if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
- and len(sha1b32) == 32 and dt.isdigit()):
+ if not (
+ sha1b32.isalnum()
+ and c_size.isdigit()
+ and offset.isdigit()
+ and len(sha1b32) == 32
+ and dt.isdigit()
+ ):
return None
- if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+ if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
return None
- if mime is None or mime == '-':
+ if mime is None or mime == "-":
mime = "application/octet-stream"
if normalize:
mime = normalize_mime(mime)
sha1hex = b32_hex(sha1b32)
- http_status = int(http_status)
- c_size = int(c_size)
- offset = int(offset)
return dict(
surt=surt,
url=url,
datetime=dt,
mimetype=mime,
- http_status=http_status,
+ http_status=int(http_status),
sha1b32=sha1b32,
sha1hex=sha1hex,
- warc_csize=c_size,
- warc_offset=offset,
+ warc_csize=int(c_size),
+ warc_offset=int(offset),
warc_path=warc,
)
-def parse_cdx_datetime(dt_str):
+
+def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+ if not dt_str:
+ return None
try:
- return datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+ return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
except Exception:
return None
-def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None):
+def test_parse_cdx_datetime() -> None:
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
+ assert parse_cdx_datetime("19930203123045") is not None
+ assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+ year=2020, month=10, day=28, hour=23, minute=51, second=3
+ )
+
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+ return "%04d%02d%02d%02d%02d%02d" % (
+ dt.year,
+ dt.month,
+ dt.day,
+ dt.hour,
+ dt.minute,
+ dt.second,
+ )
+
+
+def test_datetime_to_cdx() -> None:
+ assert "20201028235103" == datetime_to_cdx(
+ datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ )
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 1,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: Optional[requests.Session] = None,
+) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
@@ -176,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
return session
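+
+ # Usage sketch (added commentary; other sandcrawler modules, eg PdfTrioClient,
+ # call this the same way):
+ #
+ #   http_session = requests_retry_session(retries=3, backoff_factor=3)
+ #   resp = http_session.get("https://example.com/some/api")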
+
+def sanitize_fs_path(path: str) -> str:
+ """
+ From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+ """
+ # - pretending to chroot to the current directory
+ # - cancelling all redundant paths (/.. = /)
+ # - making the path relative
+ return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+
+def test_sanitize_fs_path() -> None:
+ assert sanitize_fs_path("/thing.png") == "thing.png"
+ assert sanitize_fs_path("../../thing.png") == "thing.png"
+ assert sanitize_fs_path("thing.png") == "thing.png"
+ assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 4606632..97d338e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,174 @@
-
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+# This is a hack to work around timeouts when processing certain PDFs with
+# poppler. For some reason, the usual Kafka timeout catcher isn't working on
+# these, maybe due to threading.
+BAD_PDF_SHA1HEX: List[str] = [
+ "011478a1e63a2a31eae1a93832a74cc95f220760",
+ "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+ "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+ "06061af0707298c12932516d1bb7c2b6dc443824",
+ "0641822e68c5a07538b967489fd19a1d5dc371a5",
+ "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+ "09db7c9f2efb496c974427a61e84292ae27fc702",
+ "0a1c13cb8783bbbf248b2345b9890e2410aa3f0a",
+ "0ccc6dc94f4e2d809fac8543870265c3421f3c9e",
+ "0d1c1567ea70e7b922ba88ccb868ffc7ca18e75c",
+ "10c6577a658bf6203557e2998b25ea9788f8adfe",
+ "15a720921ce30da983fcd1bfa7fe9aeeda503e41",
+ "1659881a31edc2d0e170f6bb26d32e74cc4ca387",
+ "17e679b0ec9444fff2ea4d02caec05dd2de80ec3",
+ "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+ "1a17a4fc43397804830cc29021281aac2e8cf0cb",
+ "1cb166f0c0b5ffe673e6bbf6a29d77278711f253",
+ "1d04e46b6848e6479dd90fe26bb11627044fb664",
+ "1d967c95546d31edaaf0c3ef9ffcc11113a9e11a",
+ "1f90194bf0c7fff1fe1ed5fff77a934c7a1b32a0",
+ "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+ "2195e528fa1cf5f8ae3b2adcc516896016c3411f",
+ "25ab9e6169f041be05844a9b4edd6574918af769",
+ "281de904c4642a9be4f17b9774fc0a2bdc8a90e3",
+ "2bd5322975653536550a039eb055174b2bf241b3",
+ "2fc64da736175810918fd32c94c5068b0d660bcc",
+ "32318fba9b05b2756b7362bcaa4722c92ed8d449",
+ "336833c6fc968cd0938250dfc93c032a30111cfc",
+ "362ad00bc24d650c8f11851f9e554fc560b73e7a",
+ "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+ "3ac0b6e17e30d141871a0a5b127536919fe5aa19",
+ "3c8a6a708da0dc1802f5f3e5267a49b3c25e1ffe",
+ "3e5f9fb94e7314447a22f3d009419a922136177f",
+ "3fad493c940137ce703f2f570ebb504e360c6df3",
+ "40aa94602ab13e5a7d9df8c989fca4fa5c01239e",
+ "427479c94d7d0e512f898bc7ff0b6f210069f902",
+ "436c9183724f051b22c96285aa8ff1d2ba709574",
+ "43a8c0abf0386d3e3397cf5e22a884761dd63db7",
+ "445968ef735b228c08c3ff4238d99fc9f4824619",
+ "447fa6b5a90742a86429a932f6608d8e141688c0",
+ "45f014d7d631559dc7726e5c5513f1e7c91c48a9",
+ "47577ff6d6876117ca69bec60a5764f7d2c2ec70",
+ "4785181cec8944eee00ddb631a5dfc771b89bab7",
+ "47db2db2cc976429568841a0496c0ab4ed7b5977",
+ "481c0bae81873988fcc8662ba8a269e8823fdea2",
+ "4c81129904f7976a50825595a3497ea7b52579ef",
+ "4edc1402712fa6827c4501fed8042e9f4447829c",
+ "50b3c5a3122272aca69855ef06b85d0b43a76eb1",
+ "52fc9b3c5199ef395d410c7cee5961dc812e4d29",
+ "53471346019947a88c1ba141fb829375527153b0",
+ "58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
+ "59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
+ "5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
+ "5e04779cbbae5ce88bb786064f756885dd6895fe",
+ "5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
+ "623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
+ "646c4a654270606256397684204ff0f3d17be2e7",
+ "64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
+ "689b5cb3ddef213d612363a903f10d0358ea64d2",
+ "6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
+ "74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
+ "75c2662a96ccc48891228df7c85eb7d4da9dd621",
+ "771f1ca0007a6fbed5b4a434c73f524f715d33c1",
+ "776859635e9dc01d97b0582f49c814ffbcb019fb",
+ "781dafda896a9f5c30f3d0a011f79a3b79b574c4",
+ "788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
+ "79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
+ "7cfc0739be9c49d94272110a0a748256bdde9be6",
+ "7daf61526ec825151f384cc1db510ca5237d5d80",
+ "7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
+ "8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
+ "859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
+ "88edcbab1cac2d70af5870422974afc253f4f0c6",
+ "89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
+ "8dcaf4ef132900dd378f7be526c884b17452713b",
+ "8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
+ "949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
+ "961ec451172f373f919c593737466300e42062cb",
+ "976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
+ "98b02eb70066c182c705ef4d14d8b723ad7f1fab",
+ "993ca31f6974f8387bb18dd7d38987d290da8781",
+ "9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
+ "a2298c137b9c8c8975bad62eea9224edb95e6952",
+ "a2671738755ab8b24775e95375dc72f1ca4e5fd6",
+ "a26f299fb97c646effeebd4c5e2968786bd0f781",
+ "a48f9b7ad627909f76d780aa4208530304ece42c",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a8357c31837404f9ebd798999d546c9398ab3648",
+ "a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
+ "ad038725bf6855a79f3c768ebe93c7103d14522f",
+ "aef581bf42e76e527f5aed3b8958fd4e7a24819f",
+ "b2b66b9c7f817a20144456f99c0be805602e8597",
+ "b2d719120306b90eb8dd3580b699a61ec70556f4",
+ "b4b8e18e27f102e59b2be2d58c7b54d0a0eb457a",
+ "b5be7f409a3a2601208c5ce08cf52b9ac1094aae",
+ "b5bf8b7467fb095c90adf3b49aa1687291e4469c",
+ "b8b427e5b3d650ba9e03197f9c3917e25b878930",
+ "bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
+ "be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
+ "c1b583fbd052572f08158d39ffe4d7510dadbebb",
+ "c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
+ "c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
+ "c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
+ "c7687fa6f637c7d32a25be0e772867d87536d35c",
+ "c7d8b37ec99cf0d987e60667f05299f200e18a5d",
+ "c92b9ae9eefa07504950b405625aef54b48f0e1a",
+ "ccb1debcfae006a3fc984e9e91309b9706a5c375",
+ "cd611c765cbb0b3b7cb2fdc07d8f0b9cc93ec257",
+ "cd8a7c3b8d850ebedc1ca791ccb37b9a2689f9c3",
+ "d055c054c330f99ec011e37186d2b429339758fd",
+ "d17b1e254cce82df5c6eb4fd492cef91e7e11558",
+ "d188762a7e3ab5d4ee8a897204316513e4e636ec",
+ "d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
+ "d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
+ "da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
+ "e01bb7256d77aea258313bb410dfcfc10512f420",
+ "e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
+ "e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
+ "e64714a81f60ab9286ec90cad682cb22e564fb6f",
+ "e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
+ "e9e84e17383e93a784a8471708619162b32fb399",
+ "eac7df5f799983d5a7cc55d10b4d426dc557febf",
+ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
+ "eb1b39fd7a874896688855a22efddef10272427c",
+ "eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
+ "edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
+ "ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
+ "ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
+ "ef1dfa325c21cff4cd8bb1a9b6c4ee6996d43c8f",
+ "ef6749d9263a01f921ba7d72df0d17671d14e5f6",
+ "f0ea221d8587cede25592266486e119d277f7096",
+ "f68f9a9202a75d2aee35252e104d796f9515001e",
+ "f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
+ "fd9bd560662e070b222d63052830837829c490f0",
+]
@dataclass
@@ -19,54 +176,84 @@ class PdfExtractResult:
sha1hex: str
status: str
error_msg: Optional[str] = None
- file_meta: Optional[Dict[str,Any]] = None
+ file_meta: Optional[Dict[str, Any]] = None
text: Optional[str] = None
page0_thumbnail: Optional[bytes] = None
has_page0_thumbnail: bool = False
meta_xml: Optional[str] = None
- pdf_info: Optional[Dict[str,Any]] = None
- pdf_extra: Optional[Dict[str,Any]] = None
- source: Optional[Dict[str,Any]] = None
+ pdf_info: Optional[Dict[str, Any]] = None
+ pdf_extra: Optional[Dict[str, Any]] = None
+ source: Optional[Dict[str, Any]] = None
def to_pdftext_dict(self) -> dict:
"""
Outputs a JSON string as would be published to Kafka text/info topic.
"""
return {
- 'key': self.sha1hex,
- 'sha1hex': self.sha1hex,
- 'status': self.status,
- 'file_meta': self.file_meta,
- 'error_msg': self.error_msg,
- 'text': self.text,
- 'has_page0_thumbnail': self.has_page0_thumbnail,
- 'meta_xml': self.meta_xml,
- 'pdf_info': self.pdf_info,
- 'pdf_extra': self.pdf_extra,
- 'source': self.source,
+ "key": self.sha1hex,
+ "sha1hex": self.sha1hex,
+ "status": self.status,
+ "file_meta": self.file_meta,
+ "error_msg": self.error_msg,
+ "text": self.text,
+ "has_page0_thumbnail": self.has_page0_thumbnail,
+ "meta_xml": self.meta_xml,
+ "pdf_info": self.pdf_info,
+ "pdf_extra": self.pdf_extra,
+ "source": self.source,
}
- @classmethod
- def from_pdftext_dict(cls, record):
+ @staticmethod
+ def from_pdftext_dict(record: Dict[str, Any]) -> "PdfExtractResult":
"""
Parses a dict (as consumed from the Kafka text/info topic) back into a PdfExtractResult.
"""
- if record['status'] != 'success':
+ if record["status"] != "success":
+ return PdfExtractResult(
+ sha1hex=record.get("sha1hex") or record["key"],
+ status=record["status"],
+ error_msg=record.get("error_msg"),
+ )
+ else:
+ return PdfExtractResult(
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ file_meta=record.get("file_meta"),
+ text=record.get("text"),
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ meta_xml=record.get("meta_xml"),
+ pdf_info=record.get("pdf_info"),
+ pdf_extra=record.get("pdf_extra"),
+ )
+
+ @staticmethod
+ def from_pdf_meta_dict(record: Dict[str, Any]) -> "PdfExtractResult":
+ """
+ Parses what would be returned from postgrest
+ """
+ if record["status"] != "success":
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- error_msg=record.get('error_msg'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ error_msg=(record.get("metadata") or {}).get("error_msg"),
)
else:
+ pdf_extra = dict()
+ for k in (
+ "page_count",
+ "page0_height",
+ "page0_width",
+ "permanent_id",
+ "pdf_version",
+ ):
+ if record.get(k):
+ pdf_extra[k] = record[k]
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- file_meta=record.get('file_meta'),
- text=record.get('text'),
- has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
- meta_xml=record.get('meta_xml'),
- pdf_info=record.get('pdf_info'),
- pdf_extra=record.get('pdf_extra'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ pdf_info=record.get("metadata"),
+ pdf_extra=pdf_extra,
)
def to_sql_tuple(self) -> tuple:
@@ -82,31 +269,33 @@ class PdfExtractResult:
# TODO: form, encrypted
if self.pdf_info:
metadata = dict()
- for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'):
+ for k in ("Title", "Subject", "Author", "Creator", "Producer", "doi"):
if k in self.pdf_info:
metadata[k.lower()] = self.pdf_info[k]
- if 'CreationDate' in self.pdf_info:
- pdf_created = self.pdf_info['CreationDate']
+ if "CreationDate" in self.pdf_info:
+ pdf_created = self.pdf_info["CreationDate"]
metadata_json: Optional[str] = None
if metadata:
metadata_json = json.dumps(metadata, sort_keys=True)
return (
self.sha1hex,
- datetime.datetime.now(), # updated
+ datetime.datetime.now(), # updated
self.status,
self.has_page0_thumbnail,
- pdf_extra.get('page_count'),
+ pdf_extra.get("page_count"),
word_count,
- pdf_extra.get('page0_height'),
- pdf_extra.get('page0_width'),
- pdf_extra.get('permanent_id'),
+ pdf_extra.get("page0_height"),
+ pdf_extra.get("page0_width"),
+ pdf_extra.get("permanent_id"),
pdf_created,
- pdf_extra.get('pdf_version'),
+ pdf_extra.get("pdf_version"),
metadata_json,
)
-def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult:
+def process_pdf(
+ blob: bytes, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG"
+) -> PdfExtractResult:
"""
A known issue is that output text is in "physical layout" mode, which means
columns will be side-by-side. We would prefer a single stream of tokens!
@@ -116,21 +305,30 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
didn't seem to work at all (returned empty strings).
"""
file_meta = gen_file_metadata(blob)
- sha1hex = file_meta['sha1hex']
- if file_meta['mimetype'] != 'application/pdf':
+ sha1hex = file_meta["sha1hex"]
+ if file_meta["mimetype"] != "application/pdf":
return PdfExtractResult(
sha1hex=sha1hex,
- status='not-pdf',
+ status="not-pdf",
error_msg=f"mimetype is '{file_meta['mimetype']}'",
file_meta=file_meta,
)
+ if sha1hex in BAD_PDF_SHA1HEX:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="bad-pdf",
+ error_msg="PDF known to cause processing issues",
+ file_meta=file_meta,
+ )
+
+ print(f"\tpoppler processing: {sha1hex}", file=sys.stderr)
try:
pdf = poppler.load_from_data(blob)
if pdf is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-pdf',
+ status="empty-pdf",
file_meta=file_meta,
has_page0_thumbnail=False,
)
@@ -138,17 +336,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if page0 is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-page0',
+ status="empty-page0",
file_meta=file_meta,
)
# this call sometimes fails and returns an AttributeError
page0rect = page0.page_rect()
- except AttributeError as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(
sha1hex=sha1hex,
- status='parse-error',
+ status="parse-error",
error_msg=str(e),
file_meta=file_meta,
)
@@ -158,7 +357,9 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
renderer = poppler.PageRenderer()
try:
full_img = renderer.render_page(page0)
- img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "RGBA", 0, 1)
+ img = Image.frombuffer(
+ "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1
+ )
img.thumbnail(thumb_size, Image.BICUBIC)
buf = BytesIO()
img.save(buf, thumb_type)
@@ -170,11 +371,45 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
print(str(e), file=sys.stderr)
page0_thumbnail = None
- full_text = page0.text()
- for n in range(1, pdf.pages):
- pageN = pdf.create_page(n)
- full_text += pageN.text()
- pdf_info = pdf.infos()
+ try:
+ full_text = page0.text()
+ for n in range(1, pdf.pages):
+ pageN = pdf.create_page(n)
+ full_text += pageN.text()
+ except AttributeError as e:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="parse-error",
+ error_msg=str(e),
+ file_meta=file_meta,
+ )
+
+ # Kafka message size limit; cap at about 1 MByte
+ if len(full_text) > 1000000:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="text-too-large",
+ error_msg="full_text chars: {}".format(len(full_text)),
+ file_meta=file_meta,
+ )
+ if len(pdf.metadata) > 1000000:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="text-too-large",
+ error_msg="meta_xml chars: {}".format(len(full_text)),
+ file_meta=file_meta,
+ )
+
+ try:
+ pdf_info = pdf.infos()
+ except UnicodeDecodeError:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="bad-unicode",
+ error_msg="in infos()",
+ file_meta=file_meta,
+ )
+
# TODO: is this actually needed? or does json marshalling work automatically?
for k in pdf_info.keys():
if isinstance(pdf_info[k], datetime.datetime):
@@ -191,7 +426,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
return PdfExtractResult(
sha1hex=sha1hex,
file_meta=file_meta,
- status='success',
+ status="success",
error_msg=None,
text=full_text or None,
has_page0_thumbnail=page0_thumbnail is not None,
@@ -208,30 +443,34 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
),
)
-class PdfExtractWorker(SandcrawlerFetchWorker):
- def __init__(self, wayback_client=None, sink=None, **kwargs):
+class PdfExtractWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.wayback_client = wayback_client
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def timeout_response(self, task) -> Dict:
- default_key = task['sha1hex']
+ def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
- error_msg="internal GROBID worker timeout",
+ error_msg="internal pdf-extract worker timeout",
source=task,
sha1hex=default_key,
)
- def process(self, record, key: Optional[str] = None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> dict:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = process_pdf(blob)
result.source = record
@@ -239,18 +478,19 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
+
class PdfExtractBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfExtractWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def process(self, blob, key: Optional[str] = None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
assert isinstance(blob, bytes)
@@ -260,4 +500,3 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
-
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index c65b6c8..112df6a 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,19 +1,19 @@
-
import time
+from typing import Any, Dict, Optional
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
-from .ia import WaybackClient, WaybackError, PetaboxError
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
-
- def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- def classify_pdf(self, blob, mode="auto"):
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
"""
Returns a dict with at least:
@@ -26,45 +26,43 @@ class PdfTrioClient(object):
appropriately; an optional `error_msg` may also be set. For some other
errors, like connection failure, an exception is raised.
"""
- assert blob
+ assert blob and type(blob) == bytes
try:
- pdftrio_response = requests.post(
+ pdftrio_response = self.http_session.post(
self.host_url + "/classify/research-pub/" + mode,
files={
- 'pdf_content': blob,
+ "pdf_content": blob,
},
timeout=60.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'pdftrio request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "pdftrio request (HTTP POST) timeout",
}
except requests.exceptions.ConnectionError:
# crude back-off
time.sleep(2.0)
return {
- 'status': 'error-connect',
- 'status_code': -2, # heritrix3 "HTTP connect" code
- 'error_msg': 'pdftrio request connection timout',
+ "status": "error-connect",
+ "status_code": -2, # heritrix3 "HTTP connect" code
+ "error_msg": "pdftrio request connection timeout",
}
- info = dict(
- status_code=pdftrio_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
if pdftrio_response.status_code == 200:
resp_json = pdftrio_response.json()
- assert 'ensemble_score' in resp_json
- assert 'status' in resp_json
- assert 'versions' in resp_json
+ assert "ensemble_score" in resp_json
+ assert "status" in resp_json
+ assert "versions" in resp_json
info.update(resp_json)
else:
- info['status'] = 'error'
+ info["status"] = "error"
# TODO: might return JSON with some info?
- info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+ info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
return info
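+
+ # Usage sketch (added commentary, not part of the original code): the
+ # "ensemble_score" field is only guaranteed when the service returns HTTP 200.
+ #
+ #   client = PdfTrioClient(host_url="http://pdftrio.qa.fatcat.wiki")
+ #   info = client.classify_pdf(pdf_bytes)
+ #   if info["status_code"] == 200:
+ #       score = info["ensemble_score"]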
@@ -73,59 +71,72 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
This class is basically copied directly from GrobidWorker
"""
- def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
- super().__init__(wayback_client=wayback_client)
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
+ super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
- default_key = record['sha1hex']
fetch_sec = None
start = time.time()
fetch_result = self.fetch_blob(record)
fetch_sec = time.time() - start
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
- result['source'] = record
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+ result["source"] = record
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
if fetch_sec:
- result['timing']['fetch_sec'] = fetch_sec
+ result["timing"]["fetch_sec"] = fetch_sec
return result
+
class PdfTrioBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
- super().__init__()
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs
+ ):
+ super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
self.mode = mode
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
if not blob:
return None
+ assert isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
return result
-
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index fbc5273..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -1,4 +1,3 @@
-
"""
cdx
- read raw CDX, filter
@@ -20,293 +19,415 @@ grobid
"""
import os
-from typing import Optional
+import time
import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
+
+import psycopg2
+import requests
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
+from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter to full CDX lines, no liveweb
- cdx_batch = [r for r in batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
resp = self.db.insert_cdx(self.cur, cdx_batch)
if len(cdx_batch) < len(batch):
- self.counts['skip'] += len(batch) - len(cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["skip"] += len(batch) - len(cdx_batch)
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
self.db.commit()
return []
-class PersistIngestFileResultWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFileResultWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def request_to_row(self, raw):
+ def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema
if there is a problem with conversion, return None
"""
# backwards compat hacks; transform request to look like current schema
- if raw.get('ingest_type') == 'file':
- raw['ingest_type'] = 'pdf'
- if (not raw.get('link_source')
- and raw.get('base_url')
- and raw.get('ext_ids', {}).get('doi')
- and raw['base_url'] == "https://doi.org/{}".format(raw['ext_ids']['doi'])):
+ if raw.get("ingest_type") == "file":
+ raw["ingest_type"] = "pdf"
+ if (
+ not raw.get("link_source")
+ and raw.get("base_url")
+ and raw.get("ext_ids", {}).get("doi")
+ and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"])
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'doi'
- raw['link_source_id'] = raw['ext_ids']['doi']
- if (not raw.get('link_source')
- and raw.get('ingest_request_source', '').startswith('savepapernow')
- and raw.get('fatcat', {}).get('release_ident')):
+ raw["link_source"] = "doi"
+ raw["link_source_id"] = raw["ext_ids"]["doi"]
+ if (
+ not raw.get("link_source")
+ and raw.get("ingest_request_source", "").startswith("savepapernow")
+ and raw.get("fatcat", {}).get("release_ident")
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'spn'
- raw['link_source_id'] = raw['fatcat']['release_ident']
+ raw["link_source"] = "spn"
+ raw["link_source_id"] = raw["fatcat"]["release_ident"]
- for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
- if not k in raw:
- self.counts['skip-request-fields'] += 1
+ for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
+ if k not in raw:
+ self.counts["skip-request-fields"] += 1
return None
- if raw['ingest_type'] not in ('pdf', 'xml'):
- print(raw['ingest_type'])
- self.counts['skip-ingest-type'] += 1
+ if raw["ingest_type"] not in ("pdf", "xml", "html"):
+ self.counts["skip-ingest-type"] += 1
+ return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
return None
request = {
- 'ingest_type': raw['ingest_type'],
- 'base_url': raw['base_url'],
- 'link_source': raw['link_source'],
- 'link_source_id': raw['link_source_id'],
- 'ingest_request_source': raw.get('ingest_request_source'),
- 'request': {},
+ "ingest_type": raw["ingest_type"],
+ "base_url": raw["base_url"],
+ "link_source": raw["link_source"],
+ "link_source_id": raw["link_source_id"],
+ "ingest_request_source": raw.get("ingest_request_source"),
+ "request": {},
}
# extra/optional fields
- if raw.get('release_stage'):
- request['release_stage'] = raw['release_stage']
- if raw.get('fatcat', {}).get('release_ident'):
- request['request']['release_ident'] = raw['fatcat']['release_ident']
- for k in ('ext_ids', 'edit_extra', 'rel'):
+ if raw.get("release_stage"):
+ request["release_stage"] = raw["release_stage"]
+ if raw.get("fatcat", {}).get("release_ident"):
+ request["request"]["release_ident"] = raw["fatcat"]["release_ident"]
+ for k in ("ext_ids", "edit_extra", "rel"):
if raw.get(k):
- request['request'][k] = raw[k]
+ request["request"][k] = raw[k]
# if this dict is empty, trim it to save DB space
- if not request['request']:
- request['request'] = None
+ if not request["request"]:
+ request["request"] = None
return request
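+
+ # Illustrative example (hypothetical request; the DOI and values are invented):
+ #   {"ingest_type": "pdf", "base_url": "https://doi.org/10.123/abc",
+ #    "link_source": "doi", "link_source_id": "10.123/abc",
+ #    "ingest_request_source": "fatcat-changelog"}
+ # maps through mostly unchanged; ext_ids, edit_extra, and rel (if present)
+ # are nested under the "request" JSON column.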
-
- def file_result_to_row(self, raw):
+ def file_result_to_row(self, raw: dict) -> Optional[dict]:
"""
Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema
if there is a problem with conversion, return None and set skip count
"""
- for k in ('request', 'hit', 'status'):
- if not k in raw:
- self.counts['skip-result-fields'] += 1
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ self.counts["skip-result-fields"] += 1
return None
- if not 'base_url' in raw['request']:
- self.counts['skip-result-fields'] += 1
+ if "base_url" not in raw["request"]:
+ self.counts["skip-result-fields"] += 1
return None
- ingest_type = raw['request'].get('ingest_type')
- if ingest_type == 'file':
- ingest_type = 'pdf'
- if ingest_type not in ('pdf', 'xml'):
- self.counts['skip-ingest-type'] += 1
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type == "file":
+ ingest_type = "pdf"
+ if ingest_type not in (
+ "pdf",
+ "xml",
+ "html",
+ "component",
+ "src",
+ "dataset",
+ "dataset-file",
+ ):
+ self.counts["skip-ingest-type"] += 1
return None
- if raw['status'] in ("existing", ):
- self.counts['skip-existing'] += 1
+ if raw["status"] in ("existing",):
+ self.counts["skip-existing"] += 1
return None
result = {
- 'ingest_type': ingest_type,
- 'base_url': raw['request']['base_url'],
- 'hit': raw['hit'],
- 'status': raw['status'],
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
}
- terminal = raw.get('terminal')
+ terminal = raw.get("terminal")
if terminal:
- result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url')
- result['terminal_dt'] = terminal.get('terminal_dt')
- result['terminal_status_code'] = terminal.get('terminal_status_code') or terminal.get('status_code') or terminal.get('http_code')
- if result['terminal_status_code']:
- result['terminal_status_code'] = int(result['terminal_status_code'])
- result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+ result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url")
+ result["terminal_dt"] = terminal.get("terminal_dt")
+ result["terminal_status_code"] = (
+ terminal.get("terminal_status_code")
+ or terminal.get("status_code")
+ or terminal.get("http_code")
+ )
+ if result["terminal_status_code"]:
+ result["terminal_status_code"] = int(result["terminal_status_code"])
+ result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
+ if len(result["terminal_url"]) > 2048:
+ # postgresql13 doesn't like extremely large URLs in b-tree index
+ self.counts["skip-huge-url"] += 1
+ return None
+ return result
+
+ def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
+ html_body = record.get("html_body")
+ file_meta = record.get("file_meta")
+ if not (file_meta and html_body):
+ return None
+ return HtmlMetaRow(
+ sha1hex=file_meta["sha1hex"],
+ status=record.get("status"),
+ scope=record.get("scope"),
+ has_teixml=bool(html_body and html_body["status"] == "success"),
+ has_thumbnail=False, # TODO
+ word_count=(html_body and html_body.get("word_count")) or None,
+ biblio=record.get("html_biblio"),
+ resources=record.get("html_resources"),
+ )
+
+ def result_to_platform_row(self, raw: dict) -> Optional[dict]:
+ """
+ Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL ingest_fileset_platform schema
+
+ if there is a problem with conversion, return None and set skip count
+ """
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ return None
+ if "base_url" not in raw["request"]:
+ return None
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type not in ("dataset"):
+ return None
+ if raw["status"] in ("existing",):
+ return None
+ if not raw.get("platform_name"):
+ return None
+ result = {
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
+ "platform_name": raw.get("platform_name"),
+ "platform_domain": raw.get("platform_domain"),
+ "platform_id": raw.get("platform_id"),
+ "ingest_strategy": raw.get("ingest_strategy"),
+ "total_size": raw.get("total_size"),
+ "file_count": raw.get("file_count"),
+ "archiveorg_item_name": raw.get("archiveorg_item_name"),
+ "archiveorg_item_bundle_path": None,
+ "web_bundle_url": None,
+ "web_bundle_dt": None,
+ "manifest": raw.get("manifest"),
+ }
+ if result.get("fileset_bundle"):
+ result["archiveorg_item_bundle_path"] = result["fileset_bundle"].get(
+ "archiveorg_item_bundle_path"
+ )
+ result["web_bundle_url"] = (
+ result["fileset_bundle"].get("terminal", {}).get("terminal_url")
+ )
+ result["web_bundle_dt"] = (
+ result["fileset_bundle"].get("terminal", {}).get("terminal_dt")
+ )
return result
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: List[Any]) -> List[Any]:
+ self.counts["total"] += len(batch)
if not batch:
return []
- results = [self.file_result_to_row(raw) for raw in batch]
- results = [r for r in results if r]
+ results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
+ results = [r for r in results_unfiltered if r]
- requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
- requests = [r for r in requests if r]
+ irequests_unfiltered = [
+ self.request_to_row(raw["request"]) for raw in batch if raw.get("request")
+ ]
+ irequests = [
+ r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file"
+ ]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
if results:
resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
- self.counts['insert-results'] += resp[0]
- self.counts['update-results'] += resp[1]
+ self.counts["insert-results"] += resp[0]
+ self.counts["update-results"] += resp[1]
# these schemas match, so can just pass through
- cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')]
- revisit_cdx_batch = [r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx')]
+ cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
+ revisit_cdx_batch = [
+ r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")
+ ]
cdx_batch.extend(revisit_cdx_batch)
# filter to full CDX lines, with full warc_paths (not liveweb)
- cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])]
if cdx_batch:
resp = self.db.insert_cdx(self.cur, cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")]
if file_meta_batch:
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
- self.counts['insert-file_meta'] += resp[0]
- self.counts['update-file_meta'] += resp[1]
+ self.counts["insert-file_meta"] += resp[0]
+ self.counts["update-file_meta"] += resp[1]
+
+ html_meta_batch = [
+ self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body")
+ ]
+ if html_meta_batch:
+ rows = [d.to_sql_tuple() for d in html_meta_batch if d]
+ resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-html_meta"] += resp[0]
+ self.counts["update-html_meta"] += resp[1]
+
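+        # dataset ingest results with platform info also get a row in ingest_fileset_platform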
+ fileset_platform_batch_all = [
+ self.result_to_platform_row(raw)
+ for raw in batch
+ if raw.get("request", {}).get("ingest_type") == "dataset"
+ and raw.get("platform_name")
+ ]
+ fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p]
+ if fileset_platform_batch:
+ resp = self.db.insert_ingest_fileset_platform(
+ self.cur, fileset_platform_batch, on_conflict="update"
+ )
+ self.counts["insert-fileset_platform"] += resp[0]
+ self.counts["update-fileset_platform"] += resp[1]
self.db.commit()
return []
-class PersistIngestRequestWorker(PersistIngestFileResultWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFilesetWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__(db_url=db_url)
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
if not batch:
return []
- requests = [self.request_to_row(raw) for raw in batch]
- requests = [r for r in requests if r]
+ irequests_all = [self.request_to_row(raw) for raw in batch]
+ irequests: List[Dict] = [r for r in irequests_all if r]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
self.db.commit()
return []
-class PersistGrobidWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistGrobidWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.grobid = GrobidClient()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter out bad "missing status_code" timeout rows
- missing = [r for r in batch if not r.get('status_code')]
+ missing = [r for r in batch if not r.get("status_code")]
if missing:
- self.counts['skip-missing-status'] += len(missing)
- batch = [r for r in batch if r.get('status_code')]
+ self.counts["skip-missing-status"] += len(missing)
+ batch = [r for r in batch if r.get("status_code")]
for r in batch:
- if r['status_code'] != 200 or not r.get('tei_xml'):
- self.counts['s3-skip-status'] += 1
- if r.get('error_msg'):
- r['metadata'] = {'error_msg': r['error_msg'][:500]}
+ if r["status_code"] != 200 or not r.get("tei_xml"):
+ self.counts["s3-skip-status"] += 1
+ if r.get("error_msg"):
+ r["metadata"] = {"error_msg": r["error_msg"][:500]}
continue
- assert len(r['key']) == 40
+ assert len(r["key"]) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
+ blob=r["tei_xml"],
+ sha1hex=r["key"],
extension=".tei.xml",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
- # enhance with teixml2json metadata, if available
+ # enhance with GROBID TEI-XML metadata, if available
try:
metadata = self.grobid.metadata(r)
except xml.etree.ElementTree.ParseError as xml_e:
- r['status'] = 'bad-grobid-xml'
- r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+ r["status"] = "bad-grobid-xml"
+ r["metadata"] = {"error_msg": str(xml_e)[:1024]}
continue
if not metadata:
continue
- for k in ('fatcat_release', 'grobid_version'):
+ for k in ("fatcat_release", "grobid_version"):
r[k] = metadata.pop(k, None)
- if r.get('fatcat_release'):
- r['fatcat_release'] = r['fatcat_release'].replace('release_', '')
- if metadata.get('grobid_timestamp'):
- r['updated'] = metadata['grobid_timestamp']
- r['metadata'] = metadata
+ if r.get("fatcat_release"):
+ r["fatcat_release"] = r["fatcat_release"].replace("release_", "")
+ if metadata.get("grobid_timestamp"):
+ r["updated"] = metadata["grobid_timestamp"]
+ r["metadata"] = metadata
if not self.s3_only:
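+            # db and cur are always set when not in s3_only mode; assert to satisfy the type checker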
+ assert self.db and self.cur
resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
- self.counts['insert-grobid'] += resp[0]
- self.counts['update-grobid'] += resp[1]
+ self.counts["insert-grobid"] += resp[0]
+ self.counts["update-grobid"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("file_meta")]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
@@ -320,11 +441,11 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
This could be refactored into a "Sink" type with an even thinner wrapper.
"""
- def __init__(self, output_dir):
+ def __init__(self, output_dir: str):
super().__init__()
self.output_dir = output_dir
- def _blob_path(self, sha1hex, extension=".tei.xml"):
+ def _blob_path(self, sha1hex: str, extension: str = ".tei.xml") -> str:
obj_path = "{}/{}/{}{}".format(
sha1hex[0:2],
sha1hex[2:4],
@@ -333,48 +454,49 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
)
return obj_path
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
- if record.get('status_code') != 200 or not record.get('tei_xml'):
+ if record.get("status_code") != 200 or not record.get("tei_xml"):
return False
- assert(len(record['key'])) == 40
- p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
+ assert (len(record["key"])) == 40
+ p = "{}/{}".format(self.output_dir, self._blob_path(record["key"]))
os.makedirs(os.path.dirname(p), exist_ok=True)
- with open(p, 'w') as f:
- f.write(record.pop('tei_xml'))
- self.counts['written'] += 1
+ with open(p, "w") as f:
+ f.write(record.pop("tei_xml"))
+ self.counts["written"] += 1
return record
class PersistPdfTrioWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
- batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+ batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")]
for r in batch:
# copy key (sha1hex) into sub-object
- r['pdf_trio']['key'] = r['key']
- pdftrio_batch = [r['pdf_trio'] for r in batch]
+ r["pdf_trio"]["key"] = r["key"]
+ pdftrio_batch = [r["pdf_trio"] for r in batch]
resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
- self.counts['insert-pdftrio'] += resp[0]
- self.counts['update-pdftrio'] += resp[1]
-
- file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+ self.counts["insert-pdftrio"] += resp[0]
+ self.counts["update-pdftrio"] += resp[1]
+
+ file_meta_batch = [
+ r["file_meta"]
+ for r in batch
+ if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
+ ]
resp = self.db.insert_file_meta(self.cur, file_meta_batch)
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
return []
@@ -387,63 +509,63 @@ class PersistPdfTextWorker(SandcrawlerWorker):
Should keep batch sizes small.
"""
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
parsed_batch = []
for r in batch:
parsed_batch.append(PdfExtractResult.from_pdftext_dict(r))
for r in parsed_batch:
- if r.status != 'success' or not r.text:
- self.counts['s3-skip-status'] += 1
+ if r.status != "success" or not r.text:
+ self.counts["s3-skip-status"] += 1
if r.error_msg:
- r.metadata = {'error_msg': r.error_msg[:500]}
+ r.metadata = {"error_msg": r.error_msg[:500]}
continue
assert len(r.sha1hex) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="text",
blob=r.text,
sha1hex=r.sha1hex,
extension=".txt",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
if not self.s3_only:
- resp = self.db.insert_pdf_meta(self.cur, parsed_batch, on_conflict="update")
- self.counts['insert-pdf-meta'] += resp[0]
- self.counts['update-pdf-meta'] += resp[1]
+ assert self.db and self.cur
+ rows = [r.to_sql_tuple() for r in parsed_batch]
+ resp = self.db.insert_pdf_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-pdf-meta"] += resp[0]
+ self.counts["update-pdf-meta"] += resp[1]
file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
@@ -452,38 +574,212 @@ class PersistPdfTextWorker(SandcrawlerWorker):
class PersistThumbnailWorker(SandcrawlerWorker):
"""
- Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table.
+    Pushes PDF thumbnail images to blob store (S3/seaweed/minio). Does not
+    write anything to the SQL database.
- This worker *must* be used with raw kakfa mode.
+    This worker *must* be used with raw kafka mode; thumbnails are *not*
+    wrapped in JSON like most sandcrawler kafka messages.
"""
def __init__(self, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_extension = kwargs.get('s3_extension', ".jpg")
- self.s3_folder = kwargs.get('s3_folder', "pdf")
+ self.s3_extension = kwargs.get("s3_extension", ".jpg")
+ self.s3_folder = kwargs.get("s3_folder", "pdf")
- def process(self, blob: bytes, key: Optional[str] = None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
"""
Processing raw messages, not decoded JSON objects
"""
+ assert isinstance(record, bytes)
+ blob: bytes = record
if isinstance(key, bytes):
- key = key.decode('utf-8')
+ key = key.decode("utf-8")
assert key is not None and len(key) == 40 and isinstance(key, str)
- assert isinstance(blob, bytes)
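+        # very small payloads are almost certainly not real thumbnail images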
assert len(blob) >= 50
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
blob=blob,
sha1hex=key,
extension=self.s3_extension,
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
+
+
+class GenericPersistDocWorker(SandcrawlerWorker):
+ """
+ Pushes blobs from Kafka to S3.
+    Records are assumed to be JSON objects with the document body as a string field.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
+ )
+ self.s3_extension = kwargs.get("s3_extension", ".unknown")
+ self.s3_folder = kwargs.get("s3_folder", "unknown")
+ self.doc_key = "unknown"
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+
+ if record.get("status") != "success" or not record.get(self.doc_key):
+ return
+
+ assert key is not None
+ if isinstance(key, bytes):
+ key_str = key.decode("utf-8")
+ elif isinstance(key, str):
+ key_str = key
+ assert len(key_str) == 40
+ if "sha1hex" in record:
+ assert key_str == record["sha1hex"]
+
+ self.s3.put_blob(
+ folder=self.s3_folder,
+ blob=record[self.doc_key].encode("utf-8"),
+ sha1hex=key_str,
+ extension=self.s3_extension,
+ )
+ self.counts["s3-put"] += 1
+
+
+class PersistXmlDocWorker(GenericPersistDocWorker):
+ """
+    Pushes JATS XML documents to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get("s3_extension", ".jats.xml")
+ self.s3_folder = kwargs.get("s3_folder", "xml_doc")
+ self.doc_key = "jats_xml"
+
+
+class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
+ """
+ Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get("s3_extension", ".tei.xml")
+ self.s3_folder = kwargs.get("s3_folder", "html_body")
+ self.doc_key = "tei_xml"
+
+
+class PersistCrossrefWorker(SandcrawlerWorker):
+ """
+    Pushes Crossref API JSON records into postgresql. Can also talk to GROBID
+    to parse 'unstructured' references, and push the parsed results into
+    postgresql at the same time.
+ """
+
+ def __init__(
+ self,
+ db_url: str,
+ grobid_client: Optional[GrobidClient],
+ parse_refs: bool = True,
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ if grobid_client:
+ self.grobid_client = grobid_client
+ else:
+ self.grobid_client = GrobidClient()
+ self.parse_refs = parse_refs
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ crossref_batch = []
+ refs_batch = []
+ for record in batch:
+ crossref_batch.append(
+ dict(
+ doi=record["DOI"].lower().strip(),
+ indexed=record["indexed"]["date-time"],
+ record=record,
+ )
+ )
+ if self.parse_refs:
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (
+ xml.etree.ElementTree.ParseError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.ReadTimeout,
+ ):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
+
+ resp = self.db.insert_crossref(self.cur, crossref_batch)
+ if len(crossref_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(crossref_batch)
+ self.counts["insert-crossref"] += resp[0]
+ self.counts["update-crossref"] += resp[1]
+
+ if refs_batch:
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistGrobidRefsWorker(SandcrawlerWorker):
+ """
+    Simple persist worker to backfill GROBID references into postgresql
+ locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
+ """
+
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ refs_batch = []
+ for record in batch:
+ assert record["source"]
+ assert record["source_id"]
+ refs_batch.append(record)
+
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 4a1d7a4..356f050 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,22 @@
-
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+from typing import Any, Dict, List, Optional, Sequence
+
+from confluent_kafka import Consumer, KafkaException, Producer
-from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, PetaboxError
+from .ia import (
+ PetaboxError,
+ SandcrawlerBackoffError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .misc import parse_cdx_line, requests_retry_session
class SandcrawlerWorker(object):
@@ -21,31 +27,30 @@ class SandcrawlerWorker(object):
worker (pipeline-style), or defaults to stdout.
"""
- def __init__(self):
- self.counts = Counter()
- self.sink = None
- # TODO: self.counters
+ def __init__(self, sink: Optional["SandcrawlerWorker"] = None):
+ self.counts: Counter = Counter()
+ self.sink: Optional[SandcrawlerWorker] = sink
- def push_record(self, task, key=None):
- self.counts['total'] += 1
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if not self.want(task):
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
result = self.process(task, key=key)
if not result:
- self.counts['failed'] += 1
+ self.counts["failed"] += 1
return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return result
- def timeout_response(self, task):
+ def timeout_response(self, task: Any) -> Any:
"""
This should be overridden by workers that want to return something
meaningful when there is a processing timeout. Eg, JSON vs some other
@@ -53,7 +58,9 @@ class SandcrawlerWorker(object):
"""
return None
- def push_record_timeout(self, task, key=None, timeout=300):
+ def push_record_timeout(
+ self, task: Any, key: Optional[str] = None, timeout: int = 300
+ ) -> Any:
"""
A wrapper around self.push_record which sets a timeout.
@@ -62,49 +69,52 @@ class SandcrawlerWorker(object):
same process.
"""
- def timeout_handler(signum, frame):
+ def timeout_handler(signum: int, frame: Any) -> None:
raise TimeoutError("timeout processing record")
+
signal.signal(signal.SIGALRM, timeout_handler)
resp = None
signal.alarm(int(timeout))
try:
resp = self.push_record(task, key=key)
except TimeoutError:
- self.counts['timeout'] += 1
- resp = self.timeout_response(task) # pylint: disable=assignment-from-none
+ self.counts["timeout"] += 1
+ resp = self.timeout_response(task) # pylint: disable=assignment-from-none
# TODO: what if it is this push_record() itself that is timing out?
if resp and self.sink:
self.sink.push_record(resp)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
elif resp:
print(json.dumps(resp))
finally:
signal.alarm(0)
return resp
- def push_batch(self, tasks):
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
results = []
for task in tasks:
results.append(self.push_record(task))
return results
- def finish(self):
+ def finish(self) -> Counter:
if self.sink:
self.sink.finish()
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
- def want(self, task):
+ def want(self, task: Any) -> bool:
"""
Optionally override this as a filter in implementations.
"""
return True
- def process(self, task, key=None):
+ def process(self, task: Any, key: Optional[str] = None) -> Any:
"""
Derived workers need to implement business logic here.
+
+ TODO: should derived workers explicitly type-check the 'task' object?
"""
- raise NotImplementedError('implementation required')
+ raise NotImplementedError("implementation required")
class SandcrawlerFetchWorker(SandcrawlerWorker):
@@ -113,118 +123,130 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
PDFs) from wayback, archive.org, or other sources.
"""
- def __init__(self, wayback_client, **kwargs):
+ def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
super().__init__(**kwargs)
self.wayback_client = wayback_client
+ self.http_session = requests_retry_session()
- def fetch_blob(self, record):
- start_process = time.time()
- default_key = record['sha1hex']
+ def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = record["sha1hex"]
wayback_sec = None
petabox_sec = None
- if record.get('warc_path') and record.get('warc_offset'):
+ if record.get("warc_path") and record.get("warc_offset"):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
- blob = self.wayback_client.fetch_petabox_body(
- csize=record['warc_csize'],
- offset=record['warc_offset'],
- warc_path=record['warc_path'],
+ blob: bytes = self.wayback_client.fetch_petabox_body(
+ csize=record["warc_csize"],
+ offset=record["warc_offset"],
+ warc_path=record["warc_path"],
)
wayback_sec = time.time() - start
- except (WaybackError, PetaboxError) as we:
+ except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-wayback",
- error_msg=str(we),
- ),
+ status="error-wayback",
+ error_msg=str(we),
)
- elif record.get('url') and record.get('datetime'):
+ elif record.get("url") and record.get("datetime"):
# it's a partial CDX dict or something? fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
blob = self.wayback_client.fetch_replay_body(
- url=record['url'],
- datetime=record['datetime'],
+ url=record["url"],
+ datetime=record["datetime"],
)
wayback_sec = time.time() - start
- except WaybackError as we:
+ except (WaybackError, WaybackContentError) as we:
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-wayback",
- error_msg=str(we),
- ),
+ status="error-wayback",
+ error_msg=str(we),
)
- elif record.get('item') and record.get('path'):
+ elif record.get("item") and record.get("path"):
# it's petabox link; fetch via HTTP
start = time.time()
- resp = requests.get("https://archive.org/serve/{}/{}".format(
- record['item'], record['path']))
+ ia_resp = self.http_session.get(
+ "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
+ )
petabox_sec = time.time() - start
try:
- resp.raise_for_status()
+ ia_resp.raise_for_status()
except Exception as e:
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-petabox",
- error_msg=str(e),
- ),
+ status="error-petabox",
+ error_msg=str(e),
)
- blob = resp.content
+ blob = ia_resp.content
else:
- raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
- assert blob
+ raise ValueError(
+ "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed"
+ )
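+        # wayback/petabox fetches can return empty bodies; report these as a distinct 'empty-blob' status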
+ if not blob:
+ return dict(
+ key=default_key,
+ source=record,
+ status="empty-blob",
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
+ )
return dict(
key=default_key,
status="success",
source=record,
blob=blob,
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
-class MultiprocessWrapper(SandcrawlerWorker):
- def __init__(self, worker, sink, jobs=None):
+class MultiprocessWrapper(SandcrawlerWorker):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ sink: Optional[SandcrawlerWorker] = None,
+ jobs: Optional[int] = None,
+ ):
self.counts = Counter()
self.worker = worker
self.sink = sink
self.pool = multiprocessing.pool.Pool(jobs)
- def push_batch(self, tasks):
- self.counts['total'] += len(tasks)
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ self.counts["total"] += len(tasks)
print("... processing batch of: {}".format(len(tasks)), file=sys.stderr)
results = self.pool.map(self.worker.process, tasks)
for result in results:
if not result:
- self.counts['failed'] += 1
- return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ self.counts["failed"] += 1
+ return []
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return results
- def finish(self):
+ def finish(self) -> Counter:
self.pool.terminate()
if self.sink:
self.sink.finish()
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
- return worker_counts
+ return self.counts
+
class BlackholeSink(SandcrawlerWorker):
"""
@@ -233,73 +255,73 @@ class BlackholeSink(SandcrawlerWorker):
Useful for tests.
"""
- def push_record(self, task, key=None):
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
return
- def push_batch(self, tasks):
- return
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ return []
-class KafkaSink(SandcrawlerWorker):
- def __init__(self, kafka_hosts, produce_topic, **kwargs):
+class KafkaSink(SandcrawlerWorker):
+ def __init__(self, kafka_hosts: str, produce_topic: str, **kwargs):
self.sink = None
self.counts = Counter()
self.produce_topic = produce_topic
self.kafka_hosts = kafka_hosts
- config = self.producer_config({
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes
- 'api.version.request': True,
- 'api.version.fallback.ms': 0,
- })
+ config = self.producer_config(
+ {
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 30000000, # ~30 MBytes; broker is ~50 MBytes
+ "api.version.request": True,
+ "api.version.fallback.ms": 0,
+ }
+ )
self.producer = Producer(config)
-
@staticmethod
- def _fail_fast(err, msg):
+ def _fail_fast(err: Any, msg: Any) -> None:
if err is not None:
print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: dict) -> dict:
config = kafka_config.copy()
- config.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
- def push_record(self, msg, key=None):
- self.counts['total'] += 1
+ def push_record(self, msg: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if type(msg) == dict:
- if not key and 'key' in msg:
- key = msg['key']
+ if not key and "key" in msg:
+ key = msg["key"]
msg = json.dumps(msg)
if type(msg) == str:
- msg = msg.encode('utf-8')
+ msg = msg.encode("utf-8")
assert type(msg) == bytes
- self.producer.produce(
- self.produce_topic,
- msg,
- key=key,
- on_delivery=self._fail_fast)
- self.counts['produced'] += 1
+ self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast)
+ self.counts["produced"] += 1
# check for errors etc
self.producer.poll(0)
- def push_batch(self, msgs):
+ def push_batch(self, msgs: List[Any]) -> List[Any]:
for m in msgs:
self.push_record(m)
+ return []
- def finish(self):
+ def finish(self) -> Counter:
self.producer.flush()
return self.counts
@@ -309,19 +331,21 @@ class KafkaCompressSink(KafkaSink):
Variant of KafkaSink for large documents. Used for, eg, GROBID output.
"""
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: Dict[str, Any]) -> Dict[str, Any]:
config = kafka_config.copy()
- config.update({
- 'compression.codec': 'gzip',
- 'retry.backoff.ms': 250,
- 'linger.ms': 1000,
- 'batch.num.messages': 50,
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "compression.codec": "gzip",
+ "retry.backoff.ms": 250,
+ "linger.ms": 1000,
+ "batch.num.messages": 50,
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
@@ -331,11 +355,11 @@ class RecordPusher:
trivial interface, just wraps an importer and pushes records in to it.
"""
- def __init__(self, worker, **kwargs):
- self.counts = Counter()
- self.worker = worker
+ def __init__(self, worker: SandcrawlerWorker, **kwargs):
+ self.counts: Counter = Counter()
+ self.worker: SandcrawlerWorker = worker
- def run(self):
+ def run(self) -> Counter:
"""
This will look something like:
@@ -348,133 +372,140 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
- def __init__(self, worker, json_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, json_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.json_file = json_file
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.json_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
try:
record = json.loads(line)
except json.decoder.JSONDecodeError:
- self.counts['error-json-decode'] += 1
+ self.counts["error-json-decode"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class CdxLinePusher(RecordPusher):
-
- def __init__(self, worker, cdx_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, cdx_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.cdx_file = cdx_file
- self.filter_http_statuses = kwargs.get('filter_http_statuses', None)
- self.filter_mimetypes = kwargs.get('filter_mimetypes', None)
- self.allow_octet_stream = kwargs.get('allow_octet_stream', False)
- self.batch_size = kwargs.get('batch_size', None)
+ self.filter_http_statuses = kwargs.get("filter_http_statuses", None)
+ self.filter_mimetypes = kwargs.get("filter_mimetypes", None)
+ self.allow_octet_stream = kwargs.get("allow_octet_stream", False)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.cdx_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
record = parse_cdx_line(line, normalize=True)
if not record:
- self.counts['skip-parse'] += 1
+ self.counts["skip-parse"] += 1
continue
- if self.filter_http_statuses and record['http_status'] not in self.filter_http_statuses:
- self.counts['skip-http_status'] += 1
+ if (
+ self.filter_http_statuses
+ and record["http_status"] not in self.filter_http_statuses
+ ):
+ self.counts["skip-http_status"] += 1
continue
- if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes:
- self.counts['skip-mimetype'] += 1
+ if self.filter_mimetypes and record["mimetype"] not in self.filter_mimetypes:
+ self.counts["skip-mimetype"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class ZipfilePusher(RecordPusher):
-
- def __init__(self, worker, zipfile_path, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, zipfile_path: str, **kwargs):
self.counts = Counter()
self.worker = worker
self.filter_suffix = ".pdf"
self.zipfile_path = zipfile_path
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
- with zipfile.ZipFile(self.zipfile_path, 'r') as archive:
+ with zipfile.ZipFile(self.zipfile_path, "r") as archive:
for zipinfo in archive.infolist():
if not zipinfo.filename.endswith(self.filter_suffix):
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
# NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
- flo = archive.open(zipinfo, 'r')
+ flo = archive.open(zipinfo, "r")
data = flo.read(2**32)
flo.close()
if self.batch_size:
batch.append(data)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(data)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
-class KafkaJsonPusher(RecordPusher):
- def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs):
+class KafkaJsonPusher(RecordPusher):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ kafka_hosts: str,
+ consume_topic: str,
+ group: str,
+ **kwargs
+ ):
self.counts = Counter()
self.worker = worker
self.consumer = make_kafka_consumer(
@@ -482,28 +513,32 @@ class KafkaJsonPusher(RecordPusher):
consume_topic,
group,
)
- self.push_batches = kwargs.get('push_batches', False)
- self.raw_records = kwargs.get('raw_records', False)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.batch_size = kwargs.get('batch_size', 100)
+ self.push_batches = kwargs.get("push_batches", False)
+ self.raw_records = kwargs.get("raw_records", False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.batch_size = kwargs.get("batch_size", 100)
if self.batch_size in (0, 1):
self.batch_size = 1
- self.batch_worker = kwargs.get('batch_worker', False)
+ self.batch_worker = kwargs.get("batch_worker", False)
+ self.process_timeout_sec = kwargs.get("process_timeout_sec", 300)
- def run(self):
+ def run(self) -> Counter:
while True:
# TODO: this is batch-oriented, because underlying worker is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
# case where there there is one update and thousands of creates;
# update would be lingering in worker, and if worker crashed
# never created. Not great.
batch = self.consumer.consume(
- num_messages=self.batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval)".format(
- len(batch), self.poll_interval),
- file=sys.stderr)
+ num_messages=self.batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval)".format(
+ len(batch), self.poll_interval
+ ),
+ file=sys.stderr,
+ )
if not batch:
# TODO: could have some larger timeout here and
# self.worker.finish() if it's been more than, eg, a couple
@@ -515,14 +550,14 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
if self.push_batches:
- self.counts['total'] += len(batch)
- records = [json.loads(msg.value().decode('utf-8')) for msg in batch]
+ self.counts["total"] += len(batch)
+ records = [json.loads(msg.value().decode("utf-8")) for msg in batch]
self.worker.push_batch(records)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
else:
for msg in batch:
- self.counts['total'] += 1
+ self.counts["total"] += 1
if self.raw_records:
# In this mode, pass the Kafka message as bytes through
# without decoding as JSON. Eg, for thumbnails (where
@@ -530,7 +565,7 @@ class KafkaJsonPusher(RecordPusher):
# from the message)
record = msg.value()
else:
- record = json.loads(msg.value().decode('utf-8'))
+ record = json.loads(msg.value().decode("utf-8"))
# This complex bit of code implements backoff/backpressure
# in a way that will not cause this Kafka consumer to lose
# partition assignments (resulting in a rebalance). This
@@ -540,7 +575,9 @@ class KafkaJsonPusher(RecordPusher):
while not done:
try:
# use timeouts; don't want kafka itself to timeout
- self.worker.push_record_timeout(record, key=msg.key(), timeout=300)
+ self.worker.push_record_timeout(
+ record, key=msg.key(), timeout=self.process_timeout_sec
+ )
break
except SandcrawlerBackoffError as be:
print("Backing off for 200 seconds: {}".format(be))
@@ -552,8 +589,8 @@ class KafkaJsonPusher(RecordPusher):
assert not empty_batch
time.sleep(5)
self.consumer.resume(self.consumer.assignment())
- self.counts['pushed'] += 1
- if self.counts['total'] % 500 == 0:
+ self.counts["pushed"] += 1
+ if self.counts["total"] % 500 == 0:
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
for msg in batch:
# locally store offsets of processed messages; will be
@@ -562,16 +599,16 @@ class KafkaJsonPusher(RecordPusher):
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
self.consumer.close()
return self.counts
-def make_kafka_consumer(hosts, consume_topic, group):
+def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer:
topic_name = consume_topic
- def fail_fast(err, partitions):
+ def fail_fast(err: Any, partitions: List[Any]) -> None:
if err is not None:
print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
@@ -584,40 +621,41 @@ def make_kafka_consumer(hosts, consume_topic, group):
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
# messages don't have offset marked as stored until processed,
# but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 6min)
- 'max.poll.interval.ms': 360000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ "max.poll.interval.ms": 360000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
- def on_rebalance(consumer, partitions):
+ def on_rebalance(consumer: Any, partitions: List[Any]) -> None:
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions),
- file=sys.stderr)
+ print(
+ "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr
+ )
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
new file mode 100644
index 0000000..83d53d4
--- /dev/null
+++ b/python/sandcrawler/xml.py
@@ -0,0 +1,6 @@
+import xml.etree.ElementTree as ET
+
+
+def xml_reserialize(raw: bytes) -> str:
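+    """Parse and re-serialize an XML document with an explicit UTF-8 declaration."""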
+ root = ET.fromstring(raw)
+ return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 0fd0194..aebcbe1 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -1,25 +1,23 @@
#!/usr/bin/env python3
-
"""
These are generally for continuously running workers that consume from Kafka.
Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
-or minio.
+or S3 (SeaweedFS).
"""
+import argparse
import os
+import subprocess
import sys
-import argparse
-import datetime
-import raven
-from sandcrawler import *
+import sentry_sdk
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha('..')
-except Exception as e:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
+from sandcrawler import *
+from sandcrawler.persist import (
+ PersistCrossrefWorker,
+ PersistHtmlTeiXmlWorker,
+ PersistXmlDocWorker,
+)
def run_grobid_extract(args):
@@ -49,13 +47,14 @@ def run_grobid_extract(args):
)
pusher.run()
+
def run_pdf_extract(args):
consume_topic = "sandcrawler-{}.unextracted".format(args.env)
- text_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
- text_sink = KafkaCompressSink(
+ pdftext_sink = KafkaCompressSink(
kafka_hosts=args.kafka_hosts,
- produce_topic=text_topic,
+ produce_topic=pdftext_topic,
)
thumbnail_sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
@@ -66,7 +65,7 @@ def run_pdf_extract(args):
)
worker = PdfExtractWorker(
wayback_client=wayback_client,
- sink=text_sink,
+ sink=pdftext_sink,
thumbnail_sink=thumbnail_sink,
)
pusher = KafkaJsonPusher(
@@ -75,9 +74,11 @@ def run_pdf_extract(args):
consume_topic=consume_topic,
group="pdf-extract",
batch_size=1,
+        process_timeout_sec=120,
)
pusher.run()
+
def run_persist_grobid(args):
consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
worker = PersistGrobidWorker(
@@ -92,6 +93,8 @@ def run_persist_grobid(args):
kafka_group = "persist-grobid"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -102,6 +105,7 @@ def run_persist_grobid(args):
)
pusher.run()
+
def run_persist_pdftext(args):
consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
worker = PersistPdfTextWorker(
@@ -116,6 +120,8 @@ def run_persist_pdftext(args):
kafka_group = "persist-pdf-text"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -126,6 +132,7 @@ def run_persist_pdftext(args):
)
pusher.run()
+
def run_persist_thumbnail(args):
consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
worker = PersistThumbnailWorker(
@@ -136,17 +143,65 @@ def run_persist_thumbnail(args):
s3_extension=".180px.jpg",
s3_folder="pdf",
)
+ kafka_group = "persist-pdf-thumbnail"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-pdf-thumbnail",
+ group=kafka_group,
push_batches=False,
raw_records=True,
batch_size=25,
)
pusher.run()
+
+def run_persist_xml_doc(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.xml-doc"
+ worker = PersistXmlDocWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ kafka_group = "persist-xml-doc"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_html_teixml(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.html-teixml"
+ worker = PersistHtmlTeiXmlWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ kafka_group = "persist-html-teixml"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
+
def run_persist_pdftrio(args):
consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
worker = PersistPdfTrioWorker(
@@ -162,15 +217,26 @@ def run_persist_pdftrio(args):
)
pusher.run()
+
def run_ingest_file(args):
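+    # SPNv2 CDX retry delay (seconds); tuned per request queue below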
+ spn_cdx_retry_sec = 9.0
if args.bulk:
consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
consume_topic = "sandcrawler-{}.ingest-file-requests-bulk".format(args.env)
+ elif args.priority:
+ spn_cdx_retry_sec = 45.0
+ consume_group = "sandcrawler-{}-ingest-file-priority".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-priority".format(args.env)
else:
+ spn_cdx_retry_sec = 1.0
consume_group = "sandcrawler-{}-ingest-file".format(args.env)
- consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic,
@@ -182,12 +248,33 @@ def run_ingest_file(args):
grobid_client = GrobidClient(
host_url=args.grobid_host,
)
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
worker = IngestFileWorker(
grobid_client=grobid_client,
sink=sink,
grobid_sink=grobid_sink,
- # don't SPNv2 for --bulk backfill
- try_spn2=not args.bulk,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
+ spn_cdx_retry_sec=spn_cdx_retry_sec,
)
pusher = KafkaJsonPusher(
worker=worker,
@@ -198,6 +285,7 @@ def run_ingest_file(args):
)
pusher.run()
+
def run_persist_ingest_file(args):
consume_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
worker = PersistIngestFileResultWorker(
@@ -213,88 +301,195 @@ def run_persist_ingest_file(args):
)
pusher.run()
+
+def run_persist_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ consume_topic = "fatcat-{}.api-crossref".format(args.env)
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-crossref",
+ push_batches=True,
+ # small batch size because doing GROBID processing
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
- parser.add_argument('--db-url',
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--kafka-group-suffix", default="", help="Kafka consumer group suffix (optional)"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (minio) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
- help="S3 (minio) credential",
- default=os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (minio) bucket to persist into",
- default="sandcrawler-dev")
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_SECRET_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_grobid_extract = subparsers.add_parser('grobid-extract',
- help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka")
+ sub_grobid_extract = subparsers.add_parser(
+ "grobid-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka",
+ )
sub_grobid_extract.set_defaults(func=run_grobid_extract)
- sub_pdf_extract = subparsers.add_parser('pdf-extract',
- help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka")
+ sub_pdf_extract = subparsers.add_parser(
+ "pdf-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka",
+ )
sub_pdf_extract.set_defaults(func=run_pdf_extract)
- sub_persist_grobid = subparsers.add_parser('persist-grobid',
- help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres")
- sub_persist_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_grobid = subparsers.add_parser(
+ "persist-grobid",
+ help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_grobid.set_defaults(func=run_persist_grobid)
- sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
- help="daemon that consumes pdftext output from Kafka and pushes to minio and postgres")
- sub_persist_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_pdftext = subparsers.add_parser(
+ "persist-pdftext",
+ help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
- sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
- help="daemon that consumes thumbnail output from Kafka and pushes to minio and postgres")
+ sub_persist_thumbnail = subparsers.add_parser(
+ "persist-thumbnail",
+ help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
- sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
- help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+ sub_persist_xml_doc = subparsers.add_parser(
+ "persist-xml-doc",
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
+ sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
+
+ sub_persist_html_teixml = subparsers.add_parser(
+ "persist-html-teixml",
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
+ sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
+
+ sub_persist_pdftrio = subparsers.add_parser(
+ "persist-pdftrio",
+ help="daemon that consumes pdftrio output from Kafka and pushes to postgres",
+ )
sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
- sub_ingest_file = subparsers.add_parser('ingest-file',
- help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
- sub_ingest_file.add_argument('--bulk',
- action='store_true',
- help="consume from bulk kafka topic (eg, for ingest backfill)")
+ sub_ingest_file = subparsers.add_parser(
+ "ingest-file",
+ help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka",
+ )
+ sub_ingest_file.add_argument(
+ "--bulk",
+ action="store_true",
+ help="consume from bulk kafka topic (eg, for ingest backfill)",
+ )
+ sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
+ "--priority",
+ action="store_true",
+ help="consume from priority kafka topic (eg, for SPN requests)",
+ )
sub_ingest_file.set_defaults(func=run_ingest_file)
- sub_persist_ingest_file = subparsers.add_parser('persist-ingest-file',
- help="daemon that consumes ingest-file output from Kafka and pushes to postgres")
+ sub_persist_ingest_file = subparsers.add_parser(
+ "persist-ingest-file",
+ help="daemon that consumes ingest-file output from Kafka and pushes to postgres",
+ )
sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
+ sub_persist_crossref = subparsers.add_parser(
+ "persist-crossref",
+ help="daemon that persists crossref to postgres; also does GROBID ref transform",
+ )
+ sub_persist_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_persist_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+ sub_persist_crossref.set_defaults(func=run_persist_crossref)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
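For readers tracing the topic wiring above, a minimal illustrative sketch (not part of this patch) of how the per-environment Kafka topic names follow from the --bulk/--priority flags; the helper function name and the "qa" environment are made up for the example.

    def ingest_file_topics(env: str, bulk: bool = False, priority: bool = False) -> dict:
        # mirrors the branching in run_ingest_file(): bulk and priority jobs consume
        # from dedicated request topics, everything else from the "-daily" topic
        if bulk:
            consume = "sandcrawler-{}.ingest-file-requests-bulk".format(env)
        elif priority:
            consume = "sandcrawler-{}.ingest-file-requests-priority".format(env)
        else:
            consume = "sandcrawler-{}.ingest-file-requests-daily".format(env)
        return {
            "consume": consume,
            "produce": "sandcrawler-{}.ingest-file-results".format(env),
        }

    assert (
        ingest_file_topics("qa", priority=True)["consume"]
        == "sandcrawler-qa.ingest-file-requests-priority"
    )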
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..4561541 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
@@ -12,9 +11,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
@@ -22,51 +21,54 @@ def run(args):
if not l.strip():
continue
row = json.loads(l)
- if not row['hit']:
+ if not row["hit"]:
continue
request = {
- 'base_url': row['final_url'],
- 'ingest_type': args.ingest_type,
- 'link_source': args.link_source,
- 'link_source_id': row['identifier'],
- 'ingest_request_source': args.ingest_request_source,
- 'ext_ids': {
- args.extid_type: row['identifier'],
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
- request['release_stage'] = args.release_stage
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type',
- required=True,
- help="extid to encode identifier as")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
- parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage',
- default=None,
- help="to include in request")
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..6328f52
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Helper script to generate fatcat fileset entities (as JSON) from archive.org items.
+
+Takes either two args (archive.org item name and release ident), or a stream of
+tab-separated such pairs on stdin.
+
+TODO:
+- should this check the item type?
+"""
+
+import json
+import sys
+from typing import Any
+
+import internetarchive
+
+FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+}
+
+
+def want_file(f: dict, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+
+def parse_file(f: dict) -> dict:
+ """
+ Takes an IA API file and turns it in to a fatcat fileset manifest file
+ """
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = {
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
+ }
+ # TODO: will disable this hard check eventually and replace with:
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ mimetype = FORMAT_TO_MIMETYPE[f.format]
+ if mimetype:
+ mf["extra"] = dict(mimetype=mimetype)
+ return mf
+
+
+def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
+ print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+ if release_id.startswith("release_"):
+        release_id = release_id[8:]  # strip "release_" prefix
+ assert len(release_id) == 26
+ item = session.get_item(item_name)
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+ fileset = {
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
+ }
+ print(json.dumps(fileset))
+ return fileset
+
+
+def main():
+ session = internetarchive.get_session()
+ if len(sys.argv) == 3:
+ item_name = sys.argv[1]
+ release_id = sys.argv[2]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+ else:
+ for line in sys.stdin:
+ line = line.strip()
+ if not line:
+ continue
+ fields = line.split("\t")
+ assert len(fields) == 2
+ item_name = fields[0]
+ release_id = fields[1]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+
+
+if __name__ == "__main__":
+ main()
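A quick sanity check of the filtering logic above: despite the dict annotations, want_file() and parse_file() access attributes of internetarchive file objects, so a stand-in namespace object is enough for illustration. Everything in this sketch is made up and assumes the two functions are importable.

    from types import SimpleNamespace

    fake = SimpleNamespace(
        source="original",
        name="dataset.csv",
        format="CSV",
        size="1024",
        sha1="f" * 40,
        md5="0" * 32,
    )
    assert want_file(fake, "some_item")
    assert parse_file(fake)["extra"]["mimetype"] == "text/csv"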
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..0b60da3
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import internetarchive as ia
+import requests
+
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item["identifier"]
+ # TODO: error handling
+ try:
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
+
+ # Move and cleanup
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..e3bf4f0 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
-
"""
 Transform COVID-19 metadata (CNKI and Wanfang scrape JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
@@ -18,38 +18,44 @@ def canon(s):
def transform_cnki(obj):
requests = []
- assert obj['cnki_id']
-
+ assert obj["cnki_id"]
requests = []
- requests.append({
- 'base_url': canon(obj['info_url']),
- 'ingest_type': 'pdf',
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
- if 'read_url' in obj:
- requests.append({
- 'base_url': canon(obj['read_url']),
- 'ingest_type': 'pdf', # actually HTML
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
return requests
+
def transform_wanfang(obj):
- assert obj['wanfang_id']
- return [{
- 'base_url': canon(obj['url']),
- 'ingest_type': 'pdf',
- 'link_source': 'wanfang_covid19',
- 'link_source_id': obj['wanfang_id'],
- 'ingest_request_source': 'scrape-covid19',
- }]
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
def run(args):
@@ -58,26 +64,27 @@ def run(args):
continue
row = json.loads(l)
- if 'wanfang_id' in row:
+ if "wanfang_id" in row:
requests = transform_wanfang(row) or []
- elif 'cnki_id' in row:
+ elif "cnki_id" in row:
requests = transform_cnki(row) or []
else:
continue
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
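For reference, a made-up Wanfang row and the single ingest request transform_wanfang() above emits for it (this sketch is not part of the patch and assumes the module, with its urlcanon dependency, is importable):

    import json

    row = {"wanfang_id": "example123", "url": "http://journal.example.cn/paper.pdf"}
    req = transform_wanfang(row)[0]
    assert req["link_source"] == "wanfang_covid19"
    assert req["ingest_request_source"] == "scrape-covid19"
    print(json.dumps(req, sort_keys=True))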
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,23 +19,20 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -45,81 +42,80 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..093f32a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
- with open(fpath, 'wb') as f:
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
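The on-disk layout above shards by the first two hex characters twice (hence the 256 x 256 = 65536 pre-created directories). A small sketch of the resulting path, using made-up values for the SHA-1 and the --disk-dir argument:

    sha1_hex = "0123456789abcdef0123456789abcdef01234567"  # made up
    fpath = "{}/{}{}/{}/{}{}".format(
        "/srv/pdfs", "pdf/", sha1_hex[0:2], sha1_hex[2:4], sha1_hex, ".pdf"
    )
    assert fpath == "/srv/pdfs/pdf/01/23/0123456789abcdef0123456789abcdef01234567.pdf"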
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..6f37ede 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -33,152 +33,180 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..aef5c12
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Transform a DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They seem to often be
+landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
+in the HTML headers and adds an ingest request on that basis. Or even just run
+the re-ingest in-process and publish a second result.
+"""
+
+import argparse
+import json
+import sys
+from typing import List, Optional
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ # "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ # large publishers/platforms; may remove in the future
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
+]
+
+# these default to PDF; note that we also do pdf ingests for HTML pages
+CONTENT_TYPE_MAP = {
+ "abstract": [],
+ "doc": [],
+ "": ["pdf"],
+ "doi": ["pdf"],
+ "url": ["pdf"],
+ "fulltext": ["pdf"],
+ "anySimpleType": ["pdf"],
+ "application/pdf": ["pdf"],
+ "html": ["html", "pdf"],
+ "text/html": ["html", "pdf"],
+ "xml": ["xml"],
+}
+
+
+def canon(s: str) -> str:
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj: dict) -> List[dict]:
+ """
+ Transforms from a single DOAJ object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ doaj_id = obj["id"].lower()
+ assert doaj_id
+
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
+ return []
+
+ requests = []
+
+ doi: Optional[str] = None
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
+
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
+ continue
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
+ if not ingest_types:
+ continue
+
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in link["url"].lower():
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(link["url"].strip())
+ except UnicodeEncodeError:
+ continue
+
+ if not base_url or len(base_url) > 1000:
+ continue
+
+ for ingest_type in ingest_types:
+ request = {
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
+ },
+ "edit_extra": {},
+ }
+ requests.append(request)
+
+ return requests
+
+
+def run(args) -> None:
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
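To illustrate the mapping above: a minimal, made-up DOAJ record with one text/html fulltext link produces two requests (html first, then pdf), both carrying the extracted DOI. This sketch assumes transform() is importable and urlcanon is installed.

    record = {
        "id": "abcdef0123456789abcdef01234567",  # made-up DOAJ article id
        "bibjson": {
            "identifier": [{"type": "doi", "id": "10.1234/example.5678"}],
            "link": [
                {
                    "type": "fulltext",
                    "content_type": "text/html",
                    "url": "https://journal.example.com/article/view/1",
                }
            ],
        },
    }
    reqs = transform(record)
    assert [r["ingest_type"] for r in reqs] == ["html", "pdf"]
    assert all(r["ext_ids"]["doi"] == "10.1234/example.5678" for r in reqs)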
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
 And outputs JSON objects that can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
obj = dict(
sha1=sha1,
dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
size=size,
- mimetype=mimetype)
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py
new file mode 100755
index 0000000..2eb56cb
--- /dev/null
+++ b/python/scripts/fetch_cdx_sha1hex.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+"""
+This is a helper script to take fatcat file entities with partial metadata (eg,
+missing SHA256) and try to find one or more CDX record where the file may be
+found in wayback.
+
+This script uses the sandcrawler library and should be run like:
+
+ head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json
+"""
+
+import base64
+import json
+import sys
+from typing import List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+from sandcrawler.ia import CdxApiClient, cdx_to_dict
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+
+def b32_hex(s: str) -> str:
+ """
+ Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+ base32 checksums are used by, eg, heritrix and in wayback CDX files
+ """
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ if len(s) == 40:
+ return s
+ raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+
+def get_db_cdx(sha1hex: str, http_session) -> List[dict]:
+ resp = http_session.get(
+ SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex)
+ )
+ resp.raise_for_status()
+ rows = resp.json()
+ return rows or []
+
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+
+def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]:
+
+ params = {
+ "url": url,
+ "output": "json",
+ "matchType": "exact",
+ "limit": 20,
+ # TODO: group-by digest/checksum?
+ # can't filter status because might be warc/revisit
+ # "filter": "statuscode:200",
+ }
+ rows = cdx_api._query_api(params)
+ if not rows:
+ return None
+ for row in rows:
+ if row.sha1hex == sha1hex:
+ return row
+ return None
+
+
+def process_file(fe, session, cdx_api) -> dict:
+ status = "unknown"
+
+ # simple CDX db lookup first
+ cdx_row_list = get_db_cdx(fe["sha1"], http_session=session)
+ if cdx_row_list:
+ return dict(
+ file_entity=fe,
+ cdx_rows=cdx_row_list,
+ status="success-db",
+ )
+
+ original_urls = []
+ for pair in fe["urls"]:
+ u = pair["url"]
+ if not "://web.archive.org/web/" in u:
+ continue
+ seg = u.split("/")
+ assert seg[2] == "web.archive.org"
+ assert seg[3] == "web"
+ if not seg[4].isdigit():
+ continue
+ original_url = "/".join(seg[5:])
+ original_urls.append(original_url)
+
+ if len(original_urls) == 0:
+ return dict(file_entity=fe, status="skip-no-urls")
+
+ found_cdx_rows = []
+ for url in list(set(original_urls)):
+
+ cdx_record = None
+ try:
+            cdx_record = get_api_cdx(url, sha1hex=fe["sha1"], cdx_api=cdx_api)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, status="fail-cdx-403")
+ else:
+ raise
+ if cdx_record and cdx_record.sha1hex == fe["sha1"]:
+ found_cdx_rows.append(cdx_to_dict(cdx_record))
+
+ if found_cdx_rows:
+ return dict(
+ file_entity=fe,
+ cdx_rows=found_cdx_rows,
+ status="success-api",
+ )
+
+ return dict(
+ file_entity=fe,
+ status="fail-not-found",
+ )
+
+
+def main():
+ session = requests_retry_session()
+ session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+ }
+ )
+ cdx_api = CdxApiClient()
+ for line in sys.stdin:
+ if not line.strip():
+ continue
+ fe = json.loads(line)
+ print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api)))
+
+
+if __name__ == "__main__":
+ main()
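A quick round-trip check of the b32_hex() helper above, using a made-up SHA-1 value:

    import base64

    sha1_hex_in = "f" * 40  # made-up SHA-1, hex-encoded
    b32 = base64.b32encode(bytes.fromhex(sha1_hex_in)).decode("ascii")
    assert b32_hex("sha1:" + b32) == sha1_hex_in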
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index c33ab86..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,43 +1,48 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
-with open('title_slug_blacklist.txt', 'r') as f:
- TITLE_BLACKLIST = [l.strip() for l in f]
-
-TITLE_BLACKLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+with open("title_slug_denylist.txt", "r") as f:
+ TITLE_DENYLIST = [l.strip() for l in f]
+
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
- 'phd',
- 'phdstudent',
+NAME_DENYLIST = (
+ "phd",
+ "phdstudent",
)
+
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+    s = s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
@@ -45,16 +50,16 @@ def filter_title(title):
if len(title) > 500:
return None
title_slug = tokenize(title, remove_whitespace=True)
- if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+ if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+            title = title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -78,63 +83,84 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
+
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
- # same blacklist, for now
+ # same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
- if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
- return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
+ return None
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
+
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
+
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -151,9 +177,10 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
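As an example of the slug-based title filtering above (assuming tokenize() and filter_title() are importable):

    assert tokenize("Book Reviews") == "bookreviews"
    assert filter_title("Book Reviews") is None  # slug is in TITLE_DENYLIST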
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
@@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+    s = s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -51,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -59,20 +61,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -86,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -101,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -126,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
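For context, the expected stdin layout for this script appears to be four tab-separated columns (slug, score, left release JSON, right release JSON), as implied by run() and the "(score, left, right)" comment above; an illustrative input line, with <TAB> standing for a literal tab:

    somepapertitle<TAB>950<TAB>{"title": "Example Title", "authors": ["Jane Doe"], "year": 2001, "fatcat_release": "aaaabbbbccccdd"}<TAB>{"title": "Example Title", "authors": ["J. Doe"], "year": 2001, "fatcat_release": "eeeeffffgggghh"}

Consecutive lines sharing a slug are buffered and passed to process_group(), which prints one JSON list of matched releases per group.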
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
@@ -23,15 +23,16 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -44,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -52,20 +53,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
@@ -78,20 +81,21 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -99,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
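The output of this script is one line per file: the 40-character sha1 hex, a literal tab (<TAB> below), then the JSON list of DOIs that matched it (values illustrative):

    0123456789abcdef0123456789abcdef01234567<TAB>["10.1234/example.1", "10.1234/example.2"]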
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..90a0f77 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -10,43 +9,49 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
+
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
- obj = teixml2json(tei_xml, encumbered=False)
+ tei_doc = parse_document_xml(tei_xml)
+ tei_doc.remove_encumbered()
+ obj = tei_doc.to_legacy_dict()
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
+
-if __name__=='__main__':
+if __name__ == "__main__":
run()
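The switch above from grobid2json.teixml2json() to the grobid_tei_xml library follows this call pattern; a minimal standalone sketch (file name illustrative):

    import json

    from grobid_tei_xml import parse_document_xml

    with open("example.tei.xml") as f:
        tei_doc = parse_document_xml(f.read())
    tei_doc.remove_encumbered()        # strip copyright-encumbered content (abstract, body, etc.)
    legacy = tei_doc.to_legacy_dict()  # same dict shape the old teixml2json() produced
    affiliations = [a["affiliation"] for a in legacy["authors"] if a.get("affiliation")]
    print(json.dumps(affiliations))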
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 3d2e14c..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,69 +1,67 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
+
+MAX_ABSTRACT_BYTES = 4096
-MAX_ABSTRACT_BYTES=4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -73,15 +71,17 @@ def parse_grobid_json(obj):
extra = None
return dict(
- title=obj['title'].strip(),
+ title=obj["title"].strip(),
contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
abstracts=abstracts,
release_type=release_type,
release_date=release_date,
- extra=extra)
+ extra=extra,
+ )
+
def run():
for line in sys.stdin:
@@ -90,5 +90,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
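A rough sketch of what parse_grobid_json() does with a minimal input (keys and values illustrative; note the function expects a "journal" key to be present, even if its fields are empty):

    grobid = {
        "title": "An Example Title ",
        "authors": ["Jane Doe"],
        "citations": [],
        "journal": {"name": "Example Journal", "publisher": None, "volume": "3", "issue": None},
        "doi": "10.1234/EXAMPLE",
    }
    release = parse_grobid_json(grobid)
    # - the title is stripped ("An Example Title")
    # - each author string becomes a contrib dict with role="author"
    # - the DOI is lower-cased to "10.1234/example" and kept in the extra metadata
    #   (the lower-casing is the behavior change introduced above)
    # - journal name, publisher, volume, and issue are carried over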
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..8a353ca 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back into regular ingest request JSON.
@@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
"""
dict-to-dict
"""
- row.pop('created', None)
- extra = row.pop('request', None) or {}
- for k in ('ext_ids', 'edit_extra'):
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
if k in extra:
row[k] = extra[k]
- if 'release_ident' in extra:
- row['fatcat'] = dict(release_ident=extra['release_ident'])
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
return row
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -33,19 +33,27 @@ def run(args):
req = transform(json.loads(l))
except:
print(l, file=sys.stderr)
+ if args.force_recrawl:
+ req["force_recrawl"] = True
print(json.dumps(req, sort_keys=True))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="whether to add recrawl (SPNv2) flag to request",
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
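A minimal sketch of the row-to-request transform above (field values illustrative):

    row = {
        "base_url": "https://example.com/paper.pdf",
        "ingest_type": "pdf",
        "created": "2021-01-01T00:00:00Z",
        "request": {"ext_ids": {"doi": "10.1234/example"}, "release_ident": "aaaabbbbccccdd"},
    }
    req = transform(row)
    # "created" is dropped, "ext_ids" is lifted to the top level, and
    # req["fatcat"] == {"release_ident": "aaaabbbbccccdd"}
    # with --force-recrawl, run() additionally sets req["force_recrawl"] = True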
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
@@ -20,6 +20,7 @@ import sqlite3
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:%",
+]
+
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
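Each output line is a sorted-keys JSON ingest request with the shape built in transform() above; a representative example, pretty-printed here for readability (values illustrative; a "doi" entry is added under "ext_ids" only when a valid "10." DOI is present):

    {
      "base_url": "https://repository.example.edu/article/123.pdf",
      "edit_extra": {},
      "ext_ids": {"oai": "oai:repository.example.edu:123"},
      "ingest_request_source": "metha-bulk",
      "ingest_type": "pdf",
      "link_source": "oai",
      "link_source_id": "oai:repository.example.edu:123",
      "rel": null,
      "release_stage": null
    }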
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index e093dc3..8b57c5b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
@@ -7,6 +6,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
@@ -22,13 +22,16 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA", 0, 1)
- img.thumbnail((180,300), Image.BICUBIC)
- #img.thumbnail((360,600), Image.BICUBIC)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
+ img.thumbnail((180, 300), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
- #img.save(outpath, quality=95)
+ # img.save(outpath, quality=95)
+
-if __name__ == '__main__':
+if __name__ == "__main__":
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
sys.exit(-1)
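Usage is unchanged (file names illustrative):

    ./pdf_thumbnail.py paper.pdf thumb.png

The raw-mode change from "RGBA" to "BGRA" above appears to fix swapped color channels: the poppler renderer returns pixel data in BGRA byte order, so decoding it as "RGBA" inverted red and blue in the saved thumbnail.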
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,41 +1,39 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
- "://archive.org/",
- ".archive.org/",
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single unpaywall object to zero or more ingest requests.
@@ -43,48 +41,49 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -95,17 +94,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
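As with the OAI-PMH script, each output line is a sorted-keys JSON ingest request; a representative example, pretty-printed for readability (values illustrative):

    {
      "base_url": "https://journal.example.org/article/view/1/1.pdf",
      "edit_extra": {"evidence": "oa repository", "oa_status": "green"},
      "ext_ids": {"doi": "10.1234/example", "pmh_id": "oai:journal.example.org:1"},
      "ingest_request_source": "unpaywall",
      "ingest_type": "pdf",
      "link_source": "unpaywall",
      "link_source_id": "10.1234/example",
      "rel": "repository",
      "release_stage": "published"
    }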
diff --git a/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
new file mode 100644
index 0000000..54d07db
--- /dev/null
+++ b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T22:08:45Z","timestamp":1620684525878},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-64953-1_4","type":"book-chapter","created":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T02:57:20Z","timestamp":1610593040000},"page":"53-71","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Knowledge and Mathematical Objects"],"prefix":"10.1007","author":[{"given":"Lars-G\u00f6ran","family":"Johansson","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,14]]},"reference":[{"key":"4_CR12","doi-asserted-by":"publisher","volume-title":"Deflating existential consequence: A case for nominalism","author":"J Azzouni","year":"2004","unstructured":"Azzouni, J. (2004). Deflating existential consequence: A case for nominalism. New York: Oxford University Press.","DOI":"10.1093\/0195159888.001.0001"},{"key":"4_CR23","doi-asserted-by":"publisher","volume-title":"Foundations of constructive mathematics","author":"M Beeson","year":"1985","unstructured":"Beeson, M. (1985). Foundations of constructive mathematics. Berlin\/Heidelberg: Springer.","DOI":"10.1007\/978-3-642-68952-9"},{"issue":"2","key":"4_CR27","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1093\/philmat\/11.2.176","volume":"11","author":"H Billinge","year":"2003","unstructured":"Billinge, H. (2003). Did bishop have a philosophy of mathematics? Philosophica Mathematica, 11(2), 176\u2013194.","journal-title":"Philosophica Mathematica"},{"key":"4_CR29","doi-asserted-by":"publisher","volume-title":"Constructive analysis","author":"E Bishop","year":"1985","unstructured":"Bishop, E., & Bridges, D. S. (1985). Constructive analysis. Berlin: Springer.","DOI":"10.1007\/978-3-642-61667-9"},{"key":"4_CR37","series-title":"In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.)","volume-title":"Nominalism in the philosophy of mathematics","author":"O Bueno","year":"2014","unstructured":"Bueno, O. (2014). Nominalism in the philosophy of mathematics. In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.). Metaphysics Research Lab, Stanford University."},{"key":"4_CR38","volume-title":"Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen","author":"G Cantor","year":"1883","unstructured":"Cantor, G. (1883). Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen. Leipzig: Teubner."},{"key":"4_CR60","volume-title":"The seas of language","author":"M Dummett","year":"1993","unstructured":"Dummett, M. (1993). The seas of language. 
Oxford: Clarendon Press."},{"key":"4_CR73","volume-title":"In the light of logic","author":"S Feferman","year":"1998","unstructured":"Feferman, S. (1998). In the light of logic. New York: Oxford University Press."},{"key":"4_CR74","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1093\/0195148770.003.0019","volume-title":"The Oxford handbook of philosophy of mathematics and logic","author":"S Feferman","year":"2005","unstructured":"Feferman, S. (2005). Predicativity. In S. Shapiro (Ed.), The Oxford handbook of philosophy of mathematics and logic (pp. 590\u2013624). New York\/Oxford: Oxford University Press."},{"key":"4_CR77","volume-title":"Science without numbers: A defence of nominalism","author":"H H Field","year":"1980","unstructured":"Field, H. H. (1980). Science without numbers: A defence of nominalism. Oxford: Blackwell."},{"key":"4_CR88","volume-title":"Werke, volume 8","author":"C F Gauss","year":"2011","unstructured":"Gauss, C. F. (2011). Werke, volume 8. Cambridge: Cambridge University Press."},{"key":"4_CR93","unstructured":"Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."},{"key":"4_CR103","volume-title":"Mathematics without numbers: Towards a modal-structural interpretation","author":"G Hellman","year":"1989","unstructured":"Hellman, G. (1989). Mathematics without numbers: Towards a modal-structural interpretation. Oxford: Clarendon Press."},{"key":"4_CR126","first-page":"201","volume-title":"Bertrand Russell. Philosopher of the century","author":"G Kreisel","year":"1967","unstructured":"Kreisel, G. (1967). Mathematical logic: What has it done for the philosophy of mathematics? In R. Shoenman (Ed.), Bertrand Russell. Philosopher of the century (pp. 201\u2013272). London: George Allen & Unwin."},{"key":"4_CR135","doi-asserted-by":"crossref","unstructured":"Lear, J. (1980). Aristotelian infinity. Proceedings of the Aristotelian Society, New Series, 80, 187\u2013210.","DOI":"10.1093\/aristotelian\/80.1.187"},{"key":"4_CR175","doi-asserted-by":"publisher","first-page":"63","DOI":"10.12775\/LLP.1998.004","volume":"6","author":"F Pataut","year":"1998","unstructured":"Pataut, F. (1998). Incompleteness, constructivism and truth. Logic and Logical Philosophy, 6, 63\u201376.","journal-title":"Logic and Logical Philosophy"},{"key":"4_CR180","first-page":"294","volume":"14","author":"H Poincar\u00e9","year":"1906","unstructured":"Poincar\u00e9, H. (1906). Les math\u00e9matiques et la logique. Revue de m\u00e9taphysique et de morale, 14, 294\u2013317.","journal-title":"Revue de m\u00e9taphysique et de morale"},{"key":"4_CR190","volume-title":"Word and object","author":"W V O Quine","year":"1960","unstructured":"Quine, W. V. O. (1960). Word and object. Cambridge, MA: MIT Press."},{"key":"4_CR193","unstructured":"Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133\u2013136). Cambridge, MA: Harvard University Press."},{"key":"4_CR197","first-page":"31","volume-title":"Theories and things","author":"W V O Quine","year":"1981","unstructured":"Quine, W. V. O. (1981c). What price bivalence? In Theories and things (pp. 31\u201337). Cambridge, MA: The Belknap Press of Harvard University Press."},{"issue":"1","key":"4_CR198","doi-asserted-by":"publisher","first-page":"5","DOI":"10.2307\/2026889","volume":"89","author":"WV O Quine","year":"1992","unstructured":"Quine, W.V. O. (1992). Structure and nature. 
The Journal of Philosophy, 89(1), 5\u20139.","journal-title":"The Journal of Philosophy"},{"key":"4_CR199","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1080\/014453401625669","volume":"25","author":"P Raatikainen","year":"2004","unstructured":"Raatikainen, P. (2004). Conceptions of truth in intuitionism. History and Philosophy of Logic, 25, 131\u2013145.","journal-title":"History and Philosophy of Logic"},{"key":"4_CR210","unstructured":"Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29\u201353."},{"key":"4_CR212","volume-title":"Introduction to mathematical philosophy","author":"B Russell","year":"1919","unstructured":"Russell, B. (1919). Introduction to mathematical philosophy. London: Routledge."},{"key":"4_CR222","doi-asserted-by":"crossref","unstructured":"Schwarz, J. T. (2006(1966)). The pernicious influence of mathematics on science. In R. Hersch (Ed.), 18 unconventional essays on the nature of mathematics (Chap. 13, pp. 231\u2013235). New York: Springer.","DOI":"10.1007\/0-387-29831-2_13"},{"key":"4_CR233","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/BF00247187","volume":"12","author":"G Sundholm","year":"1983","unstructured":"Sundholm, G. (1983). Constructions, proofs and the meaning of logical constants. Journal of Philosophical Logic, 12, 151\u2013172.","journal-title":"Journal of Philosophical Logic"},{"issue":"2","key":"4_CR235","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10701-007-9186-9","volume":"38","author":"M Tegmark","year":"2008","unstructured":"Tegmark, M. (2008). The mathematical universe. Foundations of Physics, 38(2), 101\u2013150.","journal-title":"Foundations of Physics"},{"key":"4_CR262","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/0010-0277(90)90003-3","volume":"36","author":"K Wynn","year":"1990","unstructured":"Wynn, K. (1990). Children\u2019s understanding of counting. Cognition, 36, 155\u2013193.","journal-title":"Cognition"}],"container-title":["Synthese Library","Empiricism and Philosophy of Physics"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-64953-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T03:00:39Z","timestamp":1610593239000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":28,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-030-64953-1_4","relation":{},"ISSN":["0166-6991","2542-8292"],"issn-type":[{"value":"0166-6991","type":"print"},{"value":"2542-8292","type":"electronic"}],"published":{"date-parts":[[2021]]},"assertion":[{"value":"14 January 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}} \ No newline at end of file
diff --git a/python/tests/files/crossref_api_work_s1047951103000064.json b/python/tests/files/crossref_api_work_s1047951103000064.json
new file mode 100644
index 0000000..dfb795d
--- /dev/null
+++ b/python/tests/files/crossref_api_work_s1047951103000064.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,6,10]],"date-time":"2021-06-10T05:35:02Z","timestamp":1623303302043},"reference-count":46,"publisher":"Cambridge University Press (CUP)","issue":"1","license":[{"start":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T00:00:00Z","timestamp":1113782400000},"content-version":"unspecified","delay-in-days":807,"URL":"https:\/\/www.cambridge.org\/core\/terms"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cardiol Young"],"published-print":{"date-parts":[[2003,2]]},"abstract":"<jats:p>We designed a multi-hospital prospective study of children less than 12 years to determine the comparative clinical profile, severity of carditis, and outcome on follow up of patients suffering an initial and recurrent episodes of acute rheumatic fever. The study extended over a period of 3 years, with diagnosis based on the Jones criteria. We included 161 children in the study, 57 having only one episode and 104 with recurrent episodes. Those seen in the first episode were differentiated from those with recurrent episodes on the basis of the history. The severity of carditis was graded by clinical and echocardiographic means. In those suffering their first episode, carditis was significantly less frequent (61.4%) compared to those having recurrent episodes (96.2%). Arthritis was more marked in the first episode (61.4%) compared to recurrent episodes (36.5%). Chorea was also significantly higher in the first episode (15.8%) compared to recurrent episodes (3.8%). Sub-cutaneous nodules were more-or-less the same in those suffering the first (7%) as opposed to recurrent episodes (5.8%), but Erythema marginatum was more marked during the first episode (3.5%), being rare in recurrent episodes at 0.9%. Fever was recorded in approximately the same numbers in first (45.6%) and recurrent episodes (48.1%). Arthralgia, in contrast, was less frequent in first (21.1%) compared to recurrent episodes (32.7%). A history of sore throat was significantly increased amongst those suffering the first episode (54.4%) compared to recurrent episodes (21.2%). When we compared the severity of carditis in the first versus recurrent episodes, at the start of study mild carditis was found in 29.8% versus 10.6%, moderate carditis in 26.3% versus 53.8%, and severe carditis in 5.3% versus 31.8% of cases, respectively. At the end of study, 30.3% of patients suffering their first episode were completely cured of carditis, and all others showed significant improvement compared to those with recurrent episodes, where only 6.8% were cured, little improvement or deterioration being noted in the remainder of the patients. 
We conclude that the clinical profile of acute rheumatic fever, especially that of carditis, is milder in those suffering their first attack compared to those with recurrent episodes.<\/jats:p>","DOI":"10.1017\/s1047951103000064","type":"journal-article","created":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T11:49:54Z","timestamp":1113824994000},"page":"28-35","source":"Crossref","is-referenced-by-count":11,"title":["Clinical profile of acute rheumatic fever in Pakistan"],"prefix":"10.1017","volume":"13","author":[{"given":"Hasina Suleman","family":"Chagani","sequence":"first","affiliation":[]},{"given":"Kalimuddin","family":"Aziz","sequence":"additional","affiliation":[]}],"member":"56","published-online":{"date-parts":[[2005,4,18]]},"reference":[{"key":"S1047951103000064_ref010","doi-asserted-by":"crossref","unstructured":"Alan L , Bisno . Group A streptococcal infection and acute rheumatic fever. N Engl J Med 1991; 325: 783\u2013793.","DOI":"10.1056\/NEJM199109123251106"},{"key":"S1047951103000064_ref036","doi-asserted-by":"crossref","unstructured":"Abbasi AS , Hashmi JA , Robinson RD , Suraya S , Syed SA . Prevalence of heart disease in school children of Karachi. Am J Cardiol 1966; 18: 544\u2013547.","DOI":"10.1016\/0002-9149(66)90008-7"},{"key":"S1047951103000064_ref025","unstructured":"Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285\u2013294."},{"key":"S1047951103000064_ref013","unstructured":"Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185\u2013192."},{"key":"S1047951103000064_ref007","doi-asserted-by":"crossref","unstructured":"Okoroma EO , Ihenacho HNC , Anyanwu CH . Rheumatic fever in Nigerian children. A prospective study of 66 patients. Am J Dis Child 1981; 35: 236\u2013238.","DOI":"10.1001\/archpedi.1981.02130270028010"},{"key":"S1047951103000064_ref031","doi-asserted-by":"crossref","unstructured":"Gordis L . Effectiveness of comprehensive care program in preventing rheumatic fever. N Engl J Med 1973; 289: 331\u2013335.","DOI":"10.1056\/NEJM197308162890701"},{"key":"S1047951103000064_ref012","unstructured":"Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21\u201324."},{"key":"S1047951103000064_ref026","doi-asserted-by":"crossref","unstructured":"Reale A , Colella C , Bruno AM . Mitral stenosis in childhood: Clinical and therapeutic aspects. Am Heart J 1963; 66: 15.","DOI":"10.1016\/0002-8703(63)90064-4"},{"key":"S1047951103000064_ref046","doi-asserted-by":"crossref","unstructured":"Aziz KU , Cheema L , Memon AD . Long-term observations of rheumatic carditis. Cardiol Young 1992; 2: 254\u2013260.","DOI":"10.1017\/S1047951100001001"},{"key":"S1047951103000064_ref041","unstructured":"Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300\u2013305."},{"key":"S1047951103000064_ref002","unstructured":"Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889."},{"key":"S1047951103000064_ref043","unstructured":"Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336\u2013345."},{"key":"S1047951103000064_ref037","unstructured":"Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. 
Pakistan Heart Journal 1981; 14: 2\u20136."},{"key":"S1047951103000064_ref029","doi-asserted-by":"crossref","unstructured":"Hassel TA , Stuart KL . Rheumatic fever prophylaxis. A three-year study. Br Med J 1972; 2: 39\u201340.","DOI":"10.1136\/bmj.2.5909.39"},{"key":"S1047951103000064_ref024","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Berry AM , Duggal S , Hooja V , Ghosh S . Sequel of initial attack of acute rheumatic fever. A prospective 5-year follow-up study. Circulation 1982; 65: 375\u2013379.","DOI":"10.1161\/01.CIR.65.2.375"},{"key":"S1047951103000064_ref022","doi-asserted-by":"crossref","unstructured":"Brownell KD , Rese FB . Acute rheumatic fever in children. Incidence in Borough of New York city. JAMA. 1973; 224: 1593\u20131597.","DOI":"10.1001\/jama.1973.03220260015004"},{"key":"S1047951103000064_ref035","unstructured":"Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071\u20131081."},{"key":"S1047951103000064_ref003","unstructured":"El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980's. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183\u2013203."},{"key":"S1047951103000064_ref045","doi-asserted-by":"crossref","unstructured":"Markowitz M . Eradication of rheumatic fever. An unfulfilled hope. Circulation 1970; 41: 1077\u20131084.","DOI":"10.1161\/01.CIR.41.6.1077"},{"key":"S1047951103000064_ref005","unstructured":"Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886."},{"key":"S1047951103000064_ref017","unstructured":"Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483\u2013494."},{"key":"S1047951103000064_ref028","doi-asserted-by":"crossref","unstructured":"Ehmke DA , Stehbens JA , Young L . Two studies of compliance with daily prophylaxis in rheumatic fever patients in Iowa. Am J Public Health 1980; 70: 1189\u20131193.","DOI":"10.2105\/AJPH.70.11.1189"},{"key":"S1047951103000064_ref021","doi-asserted-by":"crossref","unstructured":"Ward C . The reappraisal of the clinical features in acute and chronic rheumatic heart disease. Etiology implications. Am Heart J 1979; 98: 298\u2013306.","DOI":"10.1016\/0002-8703(79)90040-1"},{"key":"S1047951103000064_ref009","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Thaper MK , Ahmed SA , Hooja V , Tewari P . The initial attack of acute rheumatic fever during childhood in North India. A prospective study of the clinical profile. Circulation 1974; 49: 7\u201312.","DOI":"10.1161\/01.CIR.49.1.7"},{"key":"S1047951103000064_ref016","unstructured":"Strasser T . Rheumatic fever and rheumatic heart disease in the 1970's. WHO Chron. 1978; 32: 18\u201325."},{"key":"S1047951103000064_ref019","doi-asserted-by":"crossref","unstructured":"Bland EF , Jones TD . Rheumatic fever and rheumatic heart disease. A twenty-year report on 1000 patients followed since childhood. Circulation 1951; 4: 836\u2013843.","DOI":"10.1161\/01.CIR.4.6.836"},{"key":"S1047951103000064_ref042","doi-asserted-by":"crossref","unstructured":"Wood HF , McCarty M . Laboratory aids in the diagnosis of rheumatic fever and evaluation of disease activity. Am J Med 1954; 17: 768\u2013774.","DOI":"10.1016\/0002-9343(54)90221-1"},{"key":"S1047951103000064_ref020","doi-asserted-by":"crossref","unstructured":"Baldwin JS , Kerr JM , Kuttner AG , Doyle EF . Observation in rheumatic nodules over 30 years period. 
J Pediatr 1960; 56: 465\u2013470.","DOI":"10.1016\/S0022-3476(60)80358-7"},{"key":"S1047951103000064_ref004","doi-asserted-by":"crossref","unstructured":"Majeed HA , Khan N , Dabbagh M , Naidi K . Acute rheumatic fever during childhood in Kuwait: The mild nature of initial attack. Ann Trop Paediatr 1981; 1: 13\u201320.","DOI":"10.1080\/02724936.1981.11748053"},{"key":"S1047951103000064_ref001","unstructured":"Brittanica: Book of year 1991. Chicago, 1991."},{"key":"S1047951103000064_ref039","unstructured":"Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990."},{"key":"S1047951103000064_ref040","doi-asserted-by":"crossref","unstructured":"Taranta A , Markowitz M . Rheumatic fever. A guide to its recognition, prevention and cure, with special reference to developing countries. M.T.P. Press Ltd., Boston, 1981.","DOI":"10.1007\/978-94-015-7171-5"},{"key":"S1047951103000064_ref032","unstructured":"Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1\u201315."},{"key":"S1047951103000064_ref014","unstructured":"Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2\u20139."},{"key":"S1047951103000064_ref011","doi-asserted-by":"crossref","unstructured":"Gharib R . Acute rheumatic fever in Shiraz, Iran. It's prevalence and characteristics in two socio-economic groups. Am J Dis Child 1969: 118: 694\u2013699.","DOI":"10.1001\/archpedi.1969.02100040696005"},{"key":"S1047951103000064_ref008","unstructured":"Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543\u2013550."},{"key":"S1047951103000064_ref033","doi-asserted-by":"crossref","unstructured":"Spagnuolo M , Pasternack B , Taranta A . Risk of rheumatic fever recurrences after streptococcal infections. Prospective study of clinical and social factors. N Engl J Med 1971; 285: 641\u2013647.","DOI":"10.1056\/NEJM197109162851201"},{"key":"S1047951103000064_ref038","unstructured":"Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539\u2013549."},{"key":"S1047951103000064_ref023","doi-asserted-by":"crossref","unstructured":"Feinstein AR , Spagnuolo M . The clinical patterns of acute rheumatic fever; A reappraisal. Medicine 1962; 41: 279\u2013305.","DOI":"10.1097\/00005792-196212000-00001"},{"key":"S1047951103000064_ref018","unstructured":"Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501\u20131515."},{"key":"S1047951103000064_ref027","unstructured":"Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8\u201314."},{"key":"S1047951103000064_ref034","unstructured":"Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14\u201316."},{"key":"S1047951103000064_ref044","unstructured":"Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389\u2013395."},{"key":"S1047951103000064_ref006","unstructured":"Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. 
Indian pediatr 1983; 20: 849\u2013853."},{"key":"S1047951103000064_ref030","unstructured":"Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599\u2013603."},{"key":"S1047951103000064_ref015","doi-asserted-by":"crossref","unstructured":"Robinson RD , Sultana S , Abbasi AS et al. Acute rheumatic fever in Karachi, Pakistan. Am J Cardiol 1966; 8: 548\u2013551.","DOI":"10.1016\/0002-9149(66)90009-9"}],"container-title":["Cardiology in the Young"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.cambridge.org\/core\/services\/aop-cambridge-core\/content\/view\/S1047951103000064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T22:32:57Z","timestamp":1586212377000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,2]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2003,2]]}},"alternative-id":["S1047951103000064"],"URL":"http:\/\/dx.doi.org\/10.1017\/s1047951103000064","relation":{},"ISSN":["1047-9511","1467-1107"],"issn-type":[{"value":"1047-9511","type":"print"},{"value":"1467-1107","type":"electronic"}],"subject":["Cardiology and Cardiovascular Medicine","General Medicine","Pediatrics, Perinatology, and Child Health"],"published":{"date-parts":[[2003,2]]}}} \ No newline at end of file
diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html
new file mode 100644
index 0000000..dbe3ef7
--- /dev/null
+++ b/python/tests/files/dlib_05vanhyning.html
@@ -0,0 +1,350 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="http://schema.org/Article">
+<head>
+<script type="text/javascript" src="/js/ga.js"></script>
+<style type="text/css">
+
+.topLeft { border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftThick { border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftRight {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftRightThick {border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftBottom {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.all {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+table.plain {border-collapse: separate;
+ border-spacing: 0px;
+ margin-left: auto;
+ margin-right: auto;
+ }
+td.plain {padding: 6px;
+ vertical-align: text-top;
+ }
+
+table.author {border-collapse: separate;
+ border-spacing: 6px;
+ }
+td.authors {padding: 6px;
+ }
+
+li:not(:last-child) {
+ margin-bottom: .5em;
+ }
+
+div.center {margin-left: auto; margin-right: auto;
+ }
+
+</style>
+<meta charset="utf-8" />
+<meta id="DOI" content="10.1045/may2017-vanhyning" />
+<meta itemprop="datePublished" content="2017-05-15" />
+<meta id="description" content="D-Lib Magazine Article" />
+<meta id="keywords" content="Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS" />
+<link href="../../../style/style1.css" rel="stylesheet" type="text/css" />
+
+<title>Transforming Libraries and Archives through Crowdsourcing</title>
+</head>
+
+<body>
+<form action="/cgi-bin/search.cgi" method="get">
+
+<div style="height:2px;background:#2b538e"></div>
+<div style="height:4px;background:#4078b1"></div>
+
+<div style="height:30px;background:#4078b1">
+
+<span style="color: #ffffff; font-size: 12px; float: right; margin-right: 10px;">Search D-Lib:
+<input type="text" id="words" value="" size="25" />
+<input type="submit" id="search" value="Go!" />
+<input type="hidden" id="config" value="htdig" />
+<input type="hidden" id="restrict" value="" />
+<input type="hidden" id="exclude" value="" />
+</span>
+</div>
+
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:1px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:1px;background:#2b538e"></div>
+<div style="height:92px;background:#4078b1"><img width="450" height="90" alt="D-Lib-blocks5" src="../../../img2/D-Lib-blocks5.gif">
+</div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#e04c1e"></div>
+<div style="height:24px;background:#eda443"><img src="../../../img2/magazine5.gif" alt="The Magazine of Digital Library Research" width="830" height="24" /></div>
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:28px;background:#2b538e">
+<div id="navtable">
+<table>
+<tr><td class="navtext"><img src="../../../img2/transparent.gif" alt="" width="20" height="20" /><a href="../../../dlib.html">HOME</a>&nbsp;|&nbsp;<a href="../../../about.html">ABOUT D-LIB</a>&nbsp;|&nbsp;<a href="../../../contents.html" class="navtext">CURRENT ISSUE</a>&nbsp;|&nbsp;<a href="../../../back.html">ARCHIVE</a>&nbsp;|&nbsp;<a href="../../../author-index.html">INDEXES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/groups.html">CALENDAR</a>&nbsp;|&nbsp;<a href="../../author-guidelines.html">AUTHOR GUIDELINES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/mailman/listinfo/dlib-subscribers">SUBSCRIBE</a>&nbsp;|&nbsp;<a href="../../letters.html">CONTACT D-LIB</a></td></tr></table></div></div>
+<div style="height:4px;background:#2b538e"></div>
+<div style="height:1px;background:#e04c1e"></div>
+
+<div style="padding-left: 2.5em; padding-top: 1em;">
+
+<h3 class="blue-space">D-Lib Magazine</h3>
+<p class="blue">May/June 2017<br />
+Volume 23, Number 5/6<br />
+<a href="../05contents.html">Table of Contents</a>
+</p>
+
+<div class="divider-full">&nbsp;</div>
+
+<h3 class="blue-space">Transforming Libraries and Archives through Crowdsourcing</h3>
+
+<p class="blue">Victoria Van Hyning, University of Oxford, Zooniverse<br />
+victoria [at] zooniverse.org<br /><br />
+
+Samantha Blickhan, The Adler Planetarium, Zooniverse<br />
+samantha [at] zooniverse.org<br /><br />
+
+Laura Trouille, The Adler Planetarium, Zooniverse<br />
+trouille [at] zooniverse.org<br /><br />
+
+Chris Lintott, University of Oxford, Zooniverse<br />
+chris [at] zooniverse.org</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p><a href="https://doi.org/10.1045/may2017-vanhyning" class="nolinka">https://doi.org/10.1045/may2017-vanhyning</a></p>
+
+<div class="divider-full">&nbsp;</div>
+ <!-- Abstract or TOC goes here -->
+
+<h3 class="blue">Abstract</h3>
+
+<p class="blue">This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.</p>
+
+<p class="blue">Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS</p>
+
+<!-- Article goes next -->
+
+<div class="divider-full">&nbsp;</div>
+<h3>1 Overview<span style="vertical-align: super;"><a href="#n6">1</a></span></h3>
+
+<p>As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.</p>
+
+<p>The <a href="https://www.zooniverse.org/">Zooniverse</a> is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.<span style="vertical-align: super;"><a href="#n1">2</a></span> The Zooniverse also hosts the <a href="https://www.zooniverse.org/lab">Project Builder</a>, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see <a href="https://www.zooniverse.org/about/team">Zooniverse Team</a> for a more complete list).</p>
+
+<p>In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.1 What Can Crowdsourcing Offer GLAMs?</h4>
+
+<p>In 2010, author and professor Clay Shirky delivered a rousing <a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">TED</a> talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[<a href="#1">1</a>] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[<a href="#2">2</a>] </p>
+
+<p>Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.</p>
+
+<p>GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offers an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.<span style="vertical-align: super;"><a href="#n2">3</a></span></p>
+
+<p>While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.2 How Does The Zooniverse Work?</h4>
+
+<p>All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The number of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like <a href="https://www.shakespearesworld.org/#!/">Shakespeare's World</a> and <a href="https://anno.tate.org.uk/#!/">AnnoTate</a>, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder.</p>
+
+<p>In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is <a href="https://talk.sciencegossip.org/#/search?tags%5Bfemale%5D=true">#female</a> from the <a href="https://www.sciencegossip.org/">Science Gossip</a> project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the <a href="https://talk.sciencegossip.org/#/boards/BSC0000004/discussions/DSC00004s8">Science Gossip Talk</a> board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>2 IMLS Funded Effort: Approach and Focus</h3>
+
+<p>Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was <a href="https://www.ancientlives.org/">Ancient Lives</a>, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[<a href="#6">3</a>] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[<a href="#7">4</a>]</p>
+
+<p>By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see <a href="http://zooniverse.org/get-involved/call-for-projects">Call for Projects</a>).</p>
+
+<p>From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, <a href="https://www.operationwardiary.org/">Operation War Diary</a> which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[<a href="#8">5</a>] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.</p>
+
+<p>Over the course of the IMLS project, we will also address the following research questions:</p>
+
+<p class="indentLeft">Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?</p>
+
+<p class="indentLeft">Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?</p>
+
+<p class="indentLeft">Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.1 Addressing Q1 (Crowdsourcing for GLAM)</h4>
+
+<p>Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including <a href="http://blogs.ucl.ac.uk/transcribe-bentham/">Transcribe Bentham</a>, <a href="http://fromthepage.com/">FromthePage</a>, and <a href="http://scripto.org/">Scripto</a>. </p>
+
+<p>In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include: </p>
+
+<ul>
+ <li><i>Volunteer choice:</i> volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings. </li>
+
+ <li><i>Keeping the task simple:</i> in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.<span style="vertical-align: super;"><a href="#n3">4</a></span> Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.</li>
+
+ <li><i>Narrowing the content focus to support sense-making:</i> In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.</li>
+</ul>
+
+<p>Features optimizing best practice in regard to data production and management will include:</p>
+
+<ul>
+ <li><i>Reliable, Scalable, Open Source Code Infrastructure:</i> The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects &#151; images, video or audio &#151; for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through <a href="github.com/zooniverse">Github</a>.</li>
+
+ <li><i>Data Ingestion into Zooniverse:</i> In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.</li>
+
+ <li><i>Useful Output for Curation:</i> The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[<a href="#9">6</a>] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.</li>
+</ul>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)</h4>
+
+<p>Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document. </p>
+
+<p>In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.</p>
+
+<p>There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.3 Addressing Research Q3 (Feedback/Training)</h4>
+
+<p>We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018&#151;2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>3 Future Steps: Community Engagement, Output &amp; How to Get Involved</h3>
+
+<p>The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.</p>
+
+<p>We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse <a href="github.com/zooniverse">code</a> is freely available under a liberal open source license which serves as an additional or parallel form of publication.</p>
+
+<p>GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on <a href="https://www.zooniverse.org/lab-how-to">how to build a project</a> and <a href="https://www.zooniverse.org/lab-best-practices/great-project">best practices for the design, launch and long term phases of a project</a>. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[<a href="#10">7</a>] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.<span style="vertical-align: super;"><a href="#n5">5</a></span> Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.</p>
+
+<p>In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions. </p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>Notes</h3>
+
+<table style="width:90%">
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n6">1</a></td>
+<td style="padding-top: .5em;">Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n1">2</a></td>
+<td style="padding-top: .5em;">For a partial list of publications, please visit <a href="https://www.zooniverse.org/about/publications">https://www.zooniverse.org/about/publications</a>. </td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n2">3</a></td>
+<td style="padding-top: .5em;">Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in <i>A New Companion to Digital Humanities</i>, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley &amp; Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also <i>Crowdsourcing Our Cultural Heritage</i>, ed. Mia Ridge (Ashgate, 2014).</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n3">4</a></td>
+<td style="padding-top: .5em;">Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "<a href="http://www.digitalhumanities.org/dhq/vol/6/2/000125/000125.html">Building a Volunteer Community: Results and Findings from Transcribe Bentham</a>", <i>Digital Humanities Quarterly</i> 6.2 (2012).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n5">5</a></td>
+<td style="padding-top: .5em;">Or, as Zephyr Frank, <i>et al</i>. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "<a href="https://github.com/cestastanford/crowdsourcing/raw/master/files/Mellon%20White%20Paper.pdf">Crowdsourcing for Humanities Research</a>" (2016) Project White Paper. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>References</h3>
+
+<table style="width:90%">
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="1">[1]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Clay Shirky, "<a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">How Cognitive Surplus Will Change the World</a>", June 2010.</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="2">[2]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Tim Berners-Lee with Mark Fischetti, <i>Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor</i> (San Francisco: Harper, 1999).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="6">[3]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in <i>The Oxyrhynchus Papyri</i>, R.-L. Chang <i>et al</i>., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98. </td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="7">[4]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Alex C. Williams <i>et al.</i>, "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in <i>IEEE International Conference on Big Data</i>, October 2014. <a href="https://doi.org/10.1109/BigData.2014.7004460">https://doi.org/10.1109/BigData.2014.7004460</a></td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="8">[5]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", <i>British Journal for Military History,</i> 2.2 (2016), 160-85.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="9">[6]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Katie Mika, "<a href="http://library.mcz.harvard.edu/blog/transcription-tools-survey-katie-mika-ndsr-resident">Transcription Tools: a survey by Katie Mika, NDSR Resident</a>", Harvard University, Ernst Mayr Library Blog.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="10">[7]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Roberta Kwok, "<a href="http://www.newyorker.com/tech/elements/crowdsourcing-for-shakespeare">Crowdsourcing For Shakespeare</a>", <i>The New Yorker</i>, 16 Jan. 2017. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>About the Authors</h3>
+
+<p class="blue"><b>Victoria Van Hyning</b> is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world leading academic crowdsourcing organization. Her projects include <a href="https://www.sciencegossip.org">Science Gossip</a>, <a href="http://www.shakespearesworld.org">Shakespeare's World</a> and <a href="https://anno.tate.org.uk">AnnoTate</a>.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Samantha Blickhan</b> is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Laura Trouille</b> is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers. </p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue">Chris Lintott is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principle investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.</p>
+
+<div class="divider-full">&nbsp;</div>
+
+ <!-- Standard Copyright line here -->
+
+<div class="center">
+<p class="footer">Copyright &reg; 2017 Victoria Van Hyning, Samantha Blickhan, Laura Trouille and Chris Lintott</p>
+</div>
+
+<div style="height:1px;background:#2b538e"></div>
+
+</div>
+</form>
+</body>
+</html>
\ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_fulltext.html b/python/tests/files/first_monday_ojs3_fulltext.html
new file mode 100644
index 0000000..2248aed
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_fulltext.html
@@ -0,0 +1,441 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Surveillance, stigma and sociotechnical design for HIV</title>
+</head>
+<body bgcolor="#ffffff" LINK="#bb7777" VLINK="#7777bb" ALINK="#ffee99" text="#000000">
+<blockquote><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71629" border="1" alt="First Monday" align="bottom"><br></blockquote>
+<hr>
+<blockquote>
+
+<center><a href="#author"><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71975" alt="Surveillance, stigma and sociotechnical design for HIV by Calvin Liang, Jevan Alexander Hutson, and Os Keyes" border="1"></a></center>
+
+<br><hr><br>
+
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71627" alt="Abstract"><br>Online dating and hookup platforms have fundamentally changed people&rsquo;s day-to-day practices of sex and love &mdash; but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms &ldquo;work&rdquo; for HIV frequently focus on user-to-user interactions and disclosure of one&rsquo;s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+
+<p><strong>Contents</strong></p>
+<p><a href="#p1">Introduction</a><br>
+<a href="#p2">Methods</a><br>
+<a href="#p3">Findings</a><br>
+<a href="#p4">Discussion</a><br>
+<a href="#p5">Conclusion</a></p>
+
+<p>&nbsp;</p><hr><p>&nbsp;</p>
+<p><strong><a name="p1"></a>Introduction</strong></p>
+
+<table width="70%" align="center"><tr><td>&ldquo;AIDS is essentially a crisis of governance, of what governments do and do not do, to and for their people &mdash; we have the drugs to treat HIV infection, we have the tools to confront the risks that drive HIV transmission and prevent infection itself &mdash; what we don&rsquo;t have is national political will necessary to scale-up our response. We have demanded too little from our leaders, excused far too much.&rdquo;<br>&mdash; Gregg Gonsalves, speech at the 2006 Toronto AIDS Conference.</td></tr></table>
+
+<table width="70%" align="center"><tr><td>&ldquo;Design is inherently about change &mdash; not just in the creation of new material artifacts, but in the ways that new technological objects afford new practices, social habits, and ways of living and interacting.&rdquo;<br>&mdash; Dombrowski, <em>et al.</em> (2016). &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments.&rdquo;</td></tr></table>
+
+<p>Living and loving with HIV is a complicated task. HIV status and the stigma attached to it exists within a complex interplay of social norms and medicolegal infrastructures. The medicolegal history of HIV begins the moment that HIV and AIDS emerged, constituting a mix of medically justified legal norms and legally enforced medical requirements. The criminal justice and public health systems of modern states demarcated people living with HIV as a uniquely dangerous population, &ldquo;one that needed to be sought out, tracked down, tested, reported, listed, tagged, monitored, regulated, and, increasingly, criminalized&rdquo; <a name="1a"></a>[<a href="#1">1</a>].</p>
+
+<p>The immediate policy response in the United States imposed significant criminal and civil liability upon people living with HIV (Hoppe, 2018; Harsono, <em>et al.</em>, 2017; Sykes, <em>et al.</em>, 2016; Thrasher, 2015; Galletly, <em>et al.</em>, 2014; Lehman, <em>et al.</em>, 2014; Gagnon, 2012; Pollard, 2006; Gostin, <em>et al.</em>, 1999). Between 1986&ndash;2019, HIV-specific criminal laws and sentence enhancements applicable to people living with HIV have been enacted in 34 states and two U.S. territories (Center for HIV Law &amp; Policy, 2019; Lehman, <em>et al.</em>, 2014). Since 1986, these laws have criminalized nondisclosure of HIV and engagement in &ldquo;risky&rdquo; behaviors such as sexual activity, exposure to bodily fluids, needle sharing, sex work, blood/organ/semen donation, and, in a variety of instances, behaviors posing little, if any, risk of HIV transmission (Center for Disease Control and Prevention, 2019a; Center for HIV Law &amp; Policy, 2019).</p>
+
+<p>Despite claiming medical legitimacy for this punitive approach, researchers have long understood that the criminalization of HIV transmission was instead fueled by the associations between HIV and the gay community and communities of color (Hoppe, 2018; Gallo, 2006; Johnson, 1992; Banks, 1989) at a time when consensual sex between same-sex partners was a criminal offense in twenty-two states and over 61 percent of American evangelicals and 50 percent of non-evangelicals agreed with the statement &ldquo;I sometimes think AIDS is a punishment for the decline in moral standards&rdquo; (Gallup and Castelli, 1987).</p>
+
+<p>A significant body of empirical social science work documents the harmful effects HIV laws have had on the lives of people living with HIV (Barr&eacute;-Sinoussi, <em>et al.</em>, 2018; Harsono, <em>et al.</em>, 2017; Sweeney, <em>et al.</em>, 2017; Adam, <em>et al.</em>, 2014). HIV criminalization both reinforces and magnifies HIV-related stigma and discrimination, reduces the willingness of persons at risk for HIV to get tested or seek care, and imperils the collection of demographic health information (Harsono, <em>et al.</em>, 2017; Burris and Cameron, 2008; Galletly and Pinkerton, 2006; Elliot, 2002). A survey of over 2,000 people living with HIV in the U.S. revealed that at least 25 percent of respondents knew one or more individuals who were afraid to get tested for fear of facing criminalization (Sero Project, 2012). HIV criminalization also ignores the reality that successful antiretroviral therapy can render the level of the virus undetectable, which, according to the National Institutes of Health, means that HIV is then untransmittable (Eisinger, <em>et al.</em>, 2019).</p>
+
+<p>While HIV transmission was criminalized, other tools of control &mdash; in the form of surveillance &mdash; arose and were enforced. Early policy responses to HIV centered on overt surveillance and ostracism of those infected and perceived to be at risk (Fortin, 1995). This surveillance generally consists of disease reporting, sexual contact tracing, and data collection of people who have been diagnosed with HIV (Fan, 2012; 2011; Ward and Bell, 2014; Ward, 2005). The Center for Disease Control, for example, collects HIV data based on confidential name-based reporting laws implemented in all 50 states as of April 2008 (Center for Disease Control and Prevention, 2019b).</p>
+
+<p>HIV surveillance (and sexually transmitted infection surveillance more broadly) centralizes information and power in the state (Fairchild, <em>et al.</em>, 2007; Fan, 2012); because HIV intervention and surveillance is generally concentrated in lower income communities and health settings (McCree and Hogben, 2010), the most socially and economically marginalized communities bear the heaviest burden of HIV surveillance and its downstream consequences (Miller, <em>et al.</em>, 2004; Banks, 1989; Brandt, 1987). There is a long-racialized history of HIV, one that, in combination with the background racism of the United States, has led to the systemic undertreatment and under-consideration of communities of color (Ford, <em>et al.</em>, 2007; Anonymous, 2000; Johnson, 1992).</p>
+
+<p>This infrastructure of surveillance in turn reinforces the stigma of HIV, which has dramatic consequences for the likelihood of unwanted disclosure, access to care, psychiatric well-being, housing and employment discrimination, and, consequently, quality (or probability) of life (Lazarus, <em>et al.</em>, 2016; Mahajan, <em>et al.</em>, 2008). Coupled with the overarching stigma of HIV and its criminalization in various contexts, HIV surveillance offers a tool through which the state can identify citizens to be punished.</p>
+
+<p>In the era of &ldquo;big data&rdquo; and ubiquitous surveillance capitalism (Zuboff, 2019) &mdash; the private monetization of information about reality &mdash; HIV surveillance is not just in the hands of the state, but also in the hands of private organizations and individuals. In the context of widespread state surveillance and control and ongoing stigmatization of HIV, this opens yet more possibilities for harm through enabling the selling and redistribution of HIV status information, without the user&rsquo;s meaningful consent, to parties who may themselves engage in discrimination or direct violence.</p>
+
+<p>Many online platforms &mdash; including, as we trace out below, dating platforms &mdash; constitute not just spaces for the purposes outlined in their marketing materials but also tools for the police in tracing HIV status and criminalized behavior. In recent years, police have used technology to conduct Internet-based investigations for a similar purpose (POZ, 2015). Police now go undercover on Web sites and dating apps by creating fake identities online (Semitsu, 2011), and local law enforcement agencies and federal agencies increasingly employ these tactics in online investigations (Lichtblau and Arkin, 2014).</p>
+
+<p>Legal and public health scholars and advocates continue to call for a paradigm shift in managing HIV that leaves behind historical responses like surveillance, ostracism, and incarceration and accounts for the rise of the Internet and mobile technology and their impact on sexual attitudes and behaviors (Lehman, <em>et al.</em>, 2014; McCallum, 2014; Fan, 2011; Fenton, 2010). Since the criminalization of HIV, intimate platforms have become vital structures through which millions of people access the opportunity to engage in reciprocal romantic and sexual relationships (Hutson, <em>et al.</em>, 2018; Taylor, <em>et al.</em>, 2017; Rosenfeld and Thomas, 2012). By designing infrastructures for intimate affiliation, intimate platforms wield unmatched structural power to shape who meets whom and how within dating and sexual platforms (Hutson, <em>et al.</em>, 2018; Levy and Barocas, 2018; Emens, 2008; Robinson, 2007). These platforms frame the circumstances within which users understand each other as prospective romantic or sexual partners and shape social norms, sexual scripts, and relative advantages among users (Hardy and Lindtner, 2017; Kannabiran, <em>et al.</em>, 2012).</p>
+
+<p>The design of intimate platforms provides opportunities to explore new ways of managing HIV that reduce the concentration of power and information in the state (Fan, 2012). Through the role that platform design plays in shaping cultural norms, which has been identified as a more effective way of achieving HIV transmission prevention than flexing the punitive and surveillant arms of the state (Sunstein, 1996), intimate platform design provides opportunities to explore new ways of managing HIV (Fan, 2012). Indeed, a meta-analysis of HIV prevention efforts found that strategies that intervene in social meaning by shaping social norms, cultural practices, and individual attitudes were more effective in empowering behavioral change than appeals to fear (Albarracin, <em>et al.</em>, 2015).</p>
+
+<p>However, designing intimate platforms to account for HIV also presents serious challenges for social computing researchers and human-computer interaction (HCI) designers. As Handel and Shklovski pointed out: &ldquo;The minutiae of design decisions around profile options deserves particular attention because even the smallest changes can result in substantial differences for user interactions&rdquo; (Handel and Shklovski, 2012). In addition to concerns around how to best design for HIV, platforms, Grindr in particular, have already come under fire for sharing user HIV information with third parties (Singer, 2018). Moreover, designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the serious risk of re-entrenching the status quo and its incumbent inequalities and power relations (Bardzell, 2010). While designing for HIV presents opportunities to redress stigma and harm, researchers in HCI must understand that &ldquo;[i]t is not enough to have good intentions ... [we] must ground [our] efforts in clear political commitments and rigorous evaluations of the likely consequences&rdquo; (Green, 2018).</p>
+
+<p>From this comes the recognition that social computing designers and researchers seeking to design for disclosure cannot afford to ignore the ways that the lived experiences of people living with HIV are shaped by structural forces and, particularly, the reality of HIV criminalization and the State&rsquo;s role in conducting STD surveillance. Platforms, after all, do not exist in a separate sphere from material reality: a redesign that eases HIV disclosure from user-to-user might also involve the storing of disclosure data by the platform &mdash; data that can then be accessed, requisitioned, and co-opted by arms of the state. In line with Jackson, <em>et al.&rsquo;s</em> call for the social computing community to address the structural and lived consequences of law and policy that &ldquo;establish the very terrain on which design and practice can be conceived, articulated, and imagined &mdash; and upon which battles of accountability are inevitably waged&rdquo; <a name="2a"></a>[<a href="#2">2</a>], we wish to undertake a critical investigation of HIV disclosure in dating and hookup platforms. This involves not just investigating the implications of disclosure in a person-to-person sense, but also how platform design is shaped by legal and administrative regulation and how the risks of disclosure might open users up to systems of surveillance, stigma, and criminalization. We do so by using a range of platforms in an effort to gain a wide view, and to practice prefigurative politics &mdash; minimizing our assumptions about the &ldquo;type&rdquo; of people at risk of HIV infection and/or surveillance.</p>
+
+<p>To do this, we analyze platform&rsquo;s consequences for HIV through the lens of user-to-user interactions, exploring the ways that design renders users visible and vulnerable to wider carceral and surveillance infrastructures, and the way that design shapes (and is shaped) by HIV&rsquo;s legal status. We ground our discussion in a content analysis of 50 popular, mobile dating and hookup platforms, coding for design and policy choices related to HIV disclosure, prevention, destigmatization, surveillance, privacy, and criminalization. Through this, we reveal that many platforms fail to account for HIV, and of those that do, many neglect to attend to the downstream consequences of HIV disclosure and the data produced by it, while exacerbating the social, racial, and class stereotypes associated with the condition.</p>
+
+<p>As scholars and designers consider how platform design might aid HIV prevention and destigmatization (Hutson, <em>et al.</em>, 2018; Albury, <em>et al.</em>, 2017; Wohlfeiler, <em>et al.</em>, 2013; Rosser, <em>et al.</em>, 2011), we aim to grapple with the structural and ethical implications of designing for HIV, particularly how intimate platform design might aid and abet the decriminalization and surveillance of HIV (Sykes, <em>et al.</em>, 2016; Kazatchkine, <em>et al.</em>, 2015; Perone, 2013; Gagnon, 2012; J&uuml;rgens, <em>et al.</em>, 2009). Drawing on principles from social justice-oriented design to investigate controversies and design possibilities in intimate platforms, we attempt to articulate an approach to intimate platform design that not only works to reduce the stigma of user disclosure, but also works to contest historic and present power imbalances and injustices between users, platforms, and the state.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p2"></a>Methods</strong></p>
+
+<p>Using a directed content analysis (Hsieh and Shannon, 2005), we reviewed 50 existing mobile dating and hookup platforms. Content analyses have proven effective in understanding platform design and governance and the ways design practices mediate user-to-user bias and discrimination (Levy and Barocas, 2018; Hutson, <em>et al.</em>, 2018). We set out to capture a landscape of popular platforms and selected the first 50 dating and hook up platforms in the top 200 grossing social networking applications in the United States on the iOS App Store in March of 2019. <a href="#fig1">Figure 1</a> lists the platforms selected in alphabetical order.</p>
+
+<p>&nbsp;</p>
+<a name="fig1"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71623" alt="50 dating and hookup platforms surveyed"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 1:</strong> The 50 dating and hookup platforms surveyed.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>Utilizing the walkthrough method (Light, <em>et al.</em>, 2018), we explored each platform&rsquo;s HIV-related user experience. We examined design features on each of these platforms, systematically documenting design choices, policies, and informational interventions that mediate HIV. Building upon previous work around intimate platforms and HIV, we coded each of the 50 intimate platforms based on the following dimensions:</p>
+
+<table width="70%" align="center"><tr><td><p>Prevention</p>
+<ul><li>Whether the app allows same-sex connections</li>
+<li>Whether a user can disclose HIV/sexually transmitted infection (STI) status (Warner, <em>et al.</em>, 2018)</li>
+<li>If they can disclose, what are the options? (Warner, <em>et al.</em>, 2018)</li>
+<li>Whether a user can search for or filter out users with HIV/STIs? (Hutson, <em>et al.</em>, 2018)</li>
+<li>Whether the platforms provide informational interventions with respect to HIV/STI prevention (Wang, <em>et al.</em>, 2019)</li></ul>
+<p>Stigma reduction</p>
+<ul><li>Whether a user can identify as having HIV/STI (<em>e.g.</em>, &ldquo;Poz&rdquo;, etc.)</li>
+<li>Whether a user can indicate interest in or acceptance of people living with HIV/STIs (<em>e.g.</em> outward presentation, separate from filtering, not simply via profile text) (Hutson, <em>et al.</em>, 2018)</li></ul>
+<p>Policies</p>
+<ul><li>Whether the platform engages HIV/STIs in their policies (terms of service, privacy, and community policies, etc.) (Jackson, <em>et al.</em>, 2014)</li></ul></td></tr></table>
+
+<p>For ethical reasons, we did not interact with other users, only observed features, and deleted our accounts once data were collected when possible (not all platforms allowed for account deletion). The design and policy choices described and discussed below are not intended as an endorsement of any particular design intervention for managing HIV. Rather, we aim to capture the various ways intimate platforms currently manage and mediate HIV among users and how those choices map onto extant legal and surveillant infrastructures. Additionally, we highlight two limitations in how we chose which platforms to analyze. First, it is possible for a hook-up platform to not have an accompanying mobile app, meaning our selection of platforms from the iOS app store will have invariably missed Web site-based platforms. Second, we may have overlooked platforms that are more niche or community-specific, yet not as popular in the broader platform marketplace (<em>i.e.</em>, not within the top grossing platforms).</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p3"></a>Findings</strong></p>
+
+<p>&nbsp;</p>
+<a name="fig2"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71624" alt="A visualization of our content analysis"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 2:</strong> A visualization of our content analysis.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Design features</strong></em></p>
+
+<p>Out of the 50 intimate platforms we examined, 13 were meant specifically for queer communities (11 specifically targeted at gay and bisexual men and two at lesbian and bisexual women). None of the platforms we reviewed were distinctly designed for trans people. The remaining 34 platforms were for general audiences, catering to heterosexual and homosexual connections, and three platforms were exclusively for heterosexual connections (eHarmony, Uniform Dating, and Waplog). Only queer-specific platforms (six) had explicit HIV disclosure options and allowed for filtering or searching based on HIV status. <a href="#fig3">Figure 3</a> shows the disclosure options for each platform. Growlr, Taimi, and Scruff allowed users to indicate that they were accepting of people living with HIV. Grindr, Hornet, Mr. X, Xtremboy, and Scruff, five platforms all of which are queer-specific, provide informational interventions with respect to HIV/STI prevention (See <a href="#fig4">Figure 4</a> for examples). Eight dating apps mentioned HIV in their policies (five queer-specific, three general). Four dating apps allowed users to identify with an HIV/STI-relevant identity category, often labeled &ldquo;poz&rdquo;. Please see <a href="#fig2">Figure 2</a> for a visualization of our content analysis.</p>
+
+<p>&nbsp;</p>
+<a name="fig3"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71625" alt="Disclosure options"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 3:</strong> Disclosure options.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>&nbsp;</p>
+<a name="fig4"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71626" alt="Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right)"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 4:</strong> Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right).</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Policies</strong></em></p>
+
+<p>None of the 50 intimate platforms we reviewed explicitly mentions HIV in its terms of service. Four platforms expressly discuss HIV in their privacy policies (Grindr, Hornet, Scruff, and Mr. X), and four mention HIV in platform safety policies (Planet Romeo, Tinder, BlackPeopleMeet, and Our Time). No platform engaged with the legal implications of HIV, and none engaged with the public health surveillance of HIV.</p>
+
+<p>Of the four platforms that expressly engage HIV in their privacy policies (Grindr, Hornet, Mr. X, Scruff), only two (Grindr &amp; Hornet) explicitly prohibit sharing HIV information with third parties. By disclosing one&rsquo;s HIV status on Mr. X and Scruff, users consent to the platform&rsquo;s processing of that information. Grindr warns that HIV status disclosure on a user profile is effectively public information; however, the platform does not share HIV status information with third-party tracking, analytics, and advertising companies or service providers. Of all the platforms reviewed, Grindr&rsquo;s privacy policy is the only one that devotes an entire section to HIV status, which is not particularly surprising given Grindr&rsquo;s involvement in multiple controversies around sharing HIV information with third parties (Fitzsimons, 2019; Singer, 2018):</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;HIV Status. At the recommendation of HIV prevention experts and the community of Grindr users, we give you the option of publishing your health characteristics, such as your HIV status, in your Grindr community profile. Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App. As a result, you should carefully consider whether you want to disclose your HIV status. We do not share HIV status with any third-party service advertisers or third-party service providers other than companies that host data on our behalf (<em>e.g.</em>, Amazon Cloud). In addition, we do not use HIV status for advertising purposes and do not share this information with advertisers.&rdquo;</td></tr></table>
+
+<p> According to Hornet&rsquo;s privacy policies, they &ldquo;[do] not share any HIV status information with third parties unless required to do so by law&rdquo;. Of the 50 platforms reviewed, Hornet was the only one to enable users to opt into receiving &ldquo;in-app reminders to undergo HIV tests and receive information on the location of nearby testing centers.&rdquo; On Hornet, a user&rsquo;s HIV status &ldquo;is only searchable by users who have defined themselves as HIV positive.&rdquo; Scruff&rsquo;s privacy policy highlights that &ldquo;there is no requirement to&rdquo; provide them with &ldquo;health details and whether part of the POZ (HIV positive) community (for example, in creating or updating your profile),&rdquo; and that by doing so, users &ldquo;are explicitly consenting to [Scruff&rsquo;s] processing of [their] information.&rdquo; Mr. X&rsquo;s privacy policy notes that HIV status information &ldquo;may be considered &lsquo;special&rsquo; or &lsquo;sensitive&rsquo; in certain jurisdictions,&rdquo; and that by providing this information, users &ldquo;consent to [Mr. X&rsquo;s] processing that information&rdquo;.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p4"></a>Discussion</strong></p>
+
+<p><em><strong>Prevention</strong></em></p>
+
+<p>Platforms can act as an interventional tool to improve access to and perceptions of care for people living with HIV. Examples of HIV/STI prevention include a &ldquo;Last Tested Date&rdquo; section on a user&rsquo;s profile and reminders to get tested for HIV/STIs. Some current platforms engage with HIV more critically, acknowledging through specific features that HIV is an issue their users should be aware of. Hornet, for instance, provides its users with HIV-relevant educational material and resources for getting tested. Hornet also limits searching based on HIV status to people who themselves have chosen the HIV positive option, thereby limiting the possibility of HIV status-based discrimination. Hornet and Grindr can also provide reminders for users to get tested. Scruff allows users to choose from sex safety practices that include using condoms, using pre-exposure prophylaxis (PrEP), and/or treatment as prevention (Warner, <em>et al.</em>, 2019).</p>
+
+<p>Due in large part to the history of HIV&rsquo;s recognition as a medical condition, HIV has been generally classified as a &ldquo;gay man&rsquo;s problem&rdquo; in North America &mdash; frequently (albeit almost as frequently unmarked) a white, cisgender gay man&rsquo;s problem. This classification and framing acted to both separate normative society from the stigma associated with the condition and provide an avenue for activism by associating it with the most &ldquo;acceptable&rdquo; queer bodies: masculine, middle class, cisgender and white (Epstein, 1996).</p>
+
+<p>HIV has disproportionately impacted gay communities specifically, but transmission does not divide tidily along lines of sexuality. It is disproportionately prevalent in communities of color, appears in heterosexual relationships and lives, and risk of transmission follows societal vulnerability and marginalization &mdash; transgender women, especially transgender women of color, are particularly overrepresented in diagnosis rates (Clark, <em>et al.</em>, 2017). While the partial normalization of HIV &mdash; holding it outside the direct concerns of white, cisgender, heterosexual people, but embodying it in people who look &ldquo;just like them&rdquo; &mdash; may have aided in assembling efforts to address the condition, the assumptions it has created about who is at risk and who &ldquo;counts&rdquo; have been tremendous. One only has to look at the ethnographic work of Viviane Namaste, who highlights how Montreal&rsquo;s history of HIV, its recognition, and efforts at its prevention simultaneously elided the incidence rate amongst the Haitian community (which at one point had 65 percent of reported AIDS cases) and lacked any advice or conception of susceptibility for women, particularly heterosexual or bisexual women (Namaste, 2015).</p>
+
+<p>Our platform analysis demonstrates that these same assumptions about vulnerability and risk are present in the design of intimate platforms. Generic platforms (<em>i.e.</em>, those that cater to non-queer or broader, more heteronormative audiences) do not consider, engage with, or design for HIV at all, while the platforms for queer users &mdash; and more specifically gay men &mdash; do. Even within the group of 13 queer-specific applications, neither of the two queer women-specific apps allowed for HIV disclosure, even though 23 percent of people with HIV in the U.S. are women (Center for Disease Control and Prevention, 2019c). Most, if not all, platforms dedicated to general audiences do nothing when it comes to HIV prevention, contributing to general audiences&rsquo; knowledge gap around sexual health, HIV, and more. Because general audiences can go through online dating experiences without encountering HIV materials, platform designers allow these users to falsely believe that their sexual lives are excluded from important matters of sexual health.</p>
+
+<p>Our intent is not to suggest that HIV should be narrated as a problem for everyone; to ignore sexuality in the impact and risk of HIV transmission is an ahistorical mistake. But treating it <em>solely</em> as a &ldquo;gay man&rsquo;s problem&rdquo; simultaneously elides differences in vulnerability and risk within gay communities and perpetuates the silence around transmission for other populations, particularly trans women of color and/or heterosexual people. In other words, it is not that HIV is not frequently a risk for gay communities, but that drawing a line between sexuality and risk obscures the more nuanced disparities in risk and perpetuates the discourse that HIV transmission is not something anyone else has to think about.</p>
+
+<p>Platforms can and have implemented prevention efforts through Last Tested Date and Testing Reminders features. Doing so more ubiquitously, rather than solely on gay male-specific platforms, may be helpful in normalizing prevention efforts like getting tested regularly and knowing one&rsquo;s status. Through openings like this, platform designers have the opportunity to promote HIV/STI prevention and care &mdash; an opportunity that is valuable precisely for its ability to normalize prevention efforts. This is not to say that such features are without risks, particularly with regard to state surveillance, intervention, and structural forces, which are our next topics of concern and discussion.</p>
+
+<p><em><strong>Stigma &amp; disclosure</strong></em></p>
+
+<p>Designing for HIV is not as simple as including disclosure fields and status-based filtering or not. Allowing disclosure and filtering can protect people living with HIV from negative and sometimes harmful interactions, help filter out people who might discriminate against them, fight HIV stigma, and promote much-needed awareness. However, disclosure and filtering can also lead to discriminatory practices (Hutson, <em>et al.</em>, 2018), have potential for privacy unraveling (Warner, <em>et al.</em>, 2018), and contribute to surveillance (Fan, 2012, 2011).</p>
+
+<p>De-stigmatizing HIV offers designers an opportunity to engage in the structural dimensions of how HIV operates in social life and can possibly allow us to better tap into social norms around the condition that ultimately improve other outcomes. For instance, humanizing people living with HIV could lead to more people getting tested, being open about their status, and being communicative with their sexual partners. Platforms have the power to shift social norms and destigmatize HIV at scale due to their pervasiveness throughout modern connections, but designers need to contest the ethical implications of de-stigmatizing HIV on these platforms, especially through current features such as HIV-status-based filtering and disclosure options.</p>
+
+<p>Filtering and searching tools based on HIV status can be instrumental for people living with HIV to find others who are either seropositive or otherwise accepting of seropositive people. Additionally, filtering out those who might discriminate against them for their HIV status allows people living with HIV to avoid awkward or even violent interactions with users who harbor problematic beliefs about people living with HIV. Conversely, HIV status-based filtering and searching tools have representational and allocational harms. First, such filtering suggests that particular psycho-social characteristics are incumbent with HIV status. These stereotypes play out in a variety of ways, such as the framing that people living with HIV engage in &ldquo;risky&rdquo; sexual behavior. Second, HIV status-based filtering can be used to structurally exclude HIV-positive users from the opportunity to engage in intimate affiliation (Hutson, <em>et al.</em>, 2018). Platforms can and do provide users the ability to screen out other users who identify as &ldquo;Poz&rdquo; or disclose their HIV status. Not only do these design features facilitate exclusion, they may disincentivize HIV-related disclosures to the extent that such disclosures can be weaponized by other users to exclude them as potential intimate affiliates.</p>
+
+<p>Disclosure fields as a way to de-stigmatize HIV are similarly complicated in that they can inhibit and benefit people living with HIV. For one, encouraging users to disclose, regardless of their status, can create a healthier culture and discussion around HIV, possibly making talking about one&rsquo;s status an acceptable and common practice of intimate engagement. On the other hand, disclosure can be used for a variety of problematic ends that harm seropositive users. Other users may discriminate against users who have disclosed their HIV status, choosing to ignore or disengage with them entirely. Disclosure may have unintended consequences and lead to more personal and violent outcomes. Due to laws in particular jurisdictions, failure to disclose one&rsquo;s status to a partner can lead to prosecution and potentially incarceration. People living with HIV might also face physical and emotional threats for disclosing their status either publicly or privately.</p>
+
+<p>Due to these complexities, designers of dating platforms must face the question of how to de-stigmatize HIV without creating additional obstacles for people living with HIV. Platforms need to critically unpack the possible consequences of well-intentioned design choices, including HIV status-based filtering and HIV status disclosure fields. Of the platforms we reviewed, Scruff is the only one to provide for HIV disclosure without using an express &ldquo;HIV status&rdquo; field, allowing instead two disclosure options, Poz and Treatment as Prevention. &ldquo;Poz&rdquo; constitutes an association and identification with a community (<em>e.g.</em>, &ldquo;I am a bear, daddy, poz&rdquo;), while &ldquo;Treatment as Prevention&rdquo; signals antiretroviral therapy (<em>i.e.</em>, use of HIV medicines to treat HIV infection) and constitutes a link to sex safety practices.</p>
+
+<p><em><strong>Surveillance &amp; criminalization</strong></em></p>
+
+<p>At the same time, given the questions of structural power and surveillance built into these platforms, we are leery of treating disclosure option design as the site of de-stigmatization and justice. Questions of privacy and stigma go wider than micro-interactions and touch on how HIV is seen and responded to societally and administratively. The dominant responses to HIV/AIDS &ldquo;center on adjusting the traditional levers of criminal and tort law, and of public health law, with its surveillance and disciplinary regimes that concentrate information and decision-making in the state&rdquo; <a name="3a"></a>[<a href="#3">3</a>]. Indeed, HIV continues to function as a &ldquo;vector for the exercise of state power and the invention of novel logics and techniques of government,&rdquo; whereby &ldquo;[i]nfection with HIV virtually guarantees that a citizen will need to interact, either beneficently or coercively, with one or more state bureaucracies&rdquo; <a name="4a"></a>[<a href="#4">4</a>].</p>
+
+<p>The broader ecosystem of intimate platforms that we observed provided virtually no HIV-specific privacy information or protections for users living with HIV. Overall, both the platforms that account for HIV in their privacy policies and the platforms that enable disclosure but do not account for HIV in their privacy policies continue to place the risks and burden of surveillance, privacy, and disclosure on users with HIV. Grindr&rsquo;s &ldquo;HIV Status&rdquo; policy puts it clearly: &ldquo;Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App.&rdquo; By surfacing this as a risk we do not mean to suggest that users lack agency &mdash; merely that the agency to choose between a range of options can be influenced by how those options are bounded and made available in addition to the affordances and norms that platform design provides. That a user makes information public does not mean that &ldquo;consumable by all&rdquo; is the framework of disclosure that they have in mind (Wittkower, 2016).</p>
+
+<p>While some intimate platforms are working towards promoting HIV disclosure, prevention, and de-stigmatization, they are also failing to grapple with the privacy implications of HIV and their responsibility for ensuring that privacy. People living with HIV are already vulnerable and bear the weight of HIV disclosure&rsquo;s downstream consequences. By continuing to offload the burdens and risk onto those with HIV, platforms are likely contributing to problems of nondisclosure as well as reduced HIV testing. Research shows that privacy fears can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002).</p>
+
+<p>In this context, proposals to design for HIV disclosure that do not consider the wider structural implications of surveillance are concerning. Most HCI research into HIV and online dating focuses on micro-interactions and on enabling trust and certainty between users; that focus elides the implications of handing this data to a platform outside user control and the ways that this data can itself be used to control. This is not an abstract risk; just this year, Grindr (one of the platforms under study) has been the subject of scrutiny by the U.S. government over its Chinese ownership, due to fears that the Chinese government might access and copy Grindr&rsquo;s data around HIV disclosure for the purpose of domestic policing and control (Fitzsimons, 2019). If we are designing to enable HIV disclosure, are we working to reduce the stigma associated with disclosure &mdash; or are we enabling new forms of control and surveillance?</p>
+
+<p>In the United States today, intimate platforms operate within 29 states that have HIV criminal laws, which include laws that target sex/nondisclosure of HIV-positive status, sex work, exposure to bodily fluids, needle-sharing, and blood/organ/semen donation; nine states that have sentencing enhancements applicable to people living with HIV who commit an underlying assault crime; and 24 states that have prosecuted people living with HIV under non-HIV-specific general criminal laws (Center for HIV Law &amp; Policy, 2019). Here, the design of intimate platforms cannot be removed from the reality of laws that criminalize HIV, particularly HIV non-disclosure.</p>
+
+<p>People living with HIV in U.S. states with HIV-specific criminal laws must disclose their HIV status to sexual partners. Generally, &ldquo;disclosure and consent&rdquo; is an affirmative defense <a name="5a"></a>[<a href="#5">5</a>], whereby a person can avoid criminal and civil liability if they disclose their serostatus <a name="6a"></a>[<a href="#6">6</a>] and their sexual partner voluntarily consents to sexual activity with knowledge of that serostatus <a name="7a"></a>[<a href="#7">7</a>]. Many of the laws that criminalize HIV non-disclosure do not provide guidance as to what methods of disclosure and consent are enough to avoid prosecution and conviction (McCallum, 2014). No court or legislature has affirmatively stated whether verbal disclosure and consent are necessary under criminal HIV transmission statutes. Furthermore, non-verbal communication online creates uncertainty as to whether there is sufficient disclosure and consent to remove criminal liability for HIV-positive individuals. Both disclosure and consent can be ambiguous or misunderstood, a problem that is complicated by the design and widespread use of mobile dating and hookup platforms.</p>
+
+<p>It remains unclear what constitutes appropriate disclosure and informed consent in the context of intimate platforms, such as HIV disclosure fields on user profiles or other communication in a profile&rsquo;s free form text sections (<em>e.g.</em>, &ldquo;+&rdquo;, &ldquo;Poz&rdquo;, &ldquo;undetectable&rdquo;). Although some intimate platforms afford HIV-positive users the ability to disclose their serostatus in new ways, no court or legislature in the U.S. has answered whether disclosing HIV status on an intimate platform is enough to achieve informed consent and avoid criminal and civil liability. Yet many people living with HIV also use records of conversations on intimate platforms as a means of protection. For example, people disclose their status and use that record as a way to protect themselves from future allegations of non-disclosure. This ambiguity and its incumbent legal risk place significant responsibility and pressure on users living with HIV. Research shows that fears around rejection, self-blame, criminalization, and privacy can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002). Privacy concerns around HIV disclosure are often associated with the need to protect one&rsquo;s self from HIV-related stigma (Adam, <em>et al.</em>, 2011; Serovich and Mosack, 2006; Greene, <em>et al.</em>, 2003). As more and more people use platforms to meet intimate partners, the historical failure of HIV criminalization law to understand how disclosure and consent are negotiated in practice becomes all the more apparent.</p>
+
+<p>It might seem from this that designers and developers are trapped in an impossible situation &mdash; disclosure to protect users simultaneously produces the possibility of structural harms for those disclosing. While we urge designers to take both needs seriously, we do not consider it impossible; in fact, there is a range of work within queer theory and technology that not only articulates this tension of privacy, disclosure and the reuse of data but suggests queer forms of resistance to it. Writing more broadly, Brian Schram highlights the way that the increasing possibilities of &ldquo;big data&rdquo; and its attendant surveillance structures &ldquo;constitute an undoing of Queerness as a radical political injection&rdquo; <a name="8a"></a>[<a href="#8">8</a>], advocating a politics of <em>melancholia</em> that features a haunting of archives: an insertion of the dead weight of our collective memory as Queer persons into the growing catalog of our digital information. In other words, Schram suggests the deliberate incorporation of masses of false data, profiles, and traces into data stores in order to render ambiguous the truth of any presence and provide cover for those queer persons existing within the platforms&rsquo; data. What would this look like in the case of dating platforms? What are the possibilities raised by incorporating a deluge of false accounts, <em>doppelg&auml;ngers</em>, and doubles, not as a deception of the platform or its users, but against state forces examining the database?</p>
+
+<p>More broadly, we might see possibilities for the future through practices in the past. Examining how queer communities responded to HIV disclosure and protection protocols during the 1980s and 1990s, David Halperin has described the way that gay communities worked to articulate norms that balanced risks, trust, and vulnerability in the absence of structural norms: that &ldquo;it is gay men themselves who have continued to define, and to redefine, the limits of safety through an ongoing history of sexual experimentation and mutual consultation, and who have thereby produced, over time, workable compromises and pragmatic solutions that balance safety and risk&rdquo; <a name="9a"></a>[<a href="#9">9</a>]. Rather than taking universalized, top-down approaches to platform design for all, we might instead seek to work from the bottom up and to create a diverse range of spaces that challenge the ease of surveillance built into large-scale platforms and afford individual users more agency in establishing those compromises and solutions and engaging in that consultation.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p5"></a>Conclusion</strong></p>
+
+<p>As HCI researchers and designers, we continue to push the boundaries of what is technologically possible, but doing so requires us first to ask whether platform design is even an appropriate intervention in a given situation (Keyes, <em>et al.</em>, 2019; Baumer and Silberman, 2011; Suchman, 2011). The current model of platform design for HIV cannot continue, as it is too closely tied to the collection and commodification of highly sensitive personal data. However, reimagining intimate platform design provides the social computing community an opportunity to intervene in the social norms around HIV and HIV disclosure in a manner that could unburden the weight of criminalization without centralizing the surveillant arms of the state.</p>
+
+<p>We envision a future of dating platforms that does not force people living with HIV to sacrifice privacy for intimate experiences. Because of their entanglements with sex and romance, intimate platforms need to take on more responsibility in the sexual health and data privacy of their users. Drawing from our analysis and our own lived experiences, we recommend platform-level design changes, policy changes, and mechanisms to prevent platforms from knowing their users&rsquo; statuses. First, platforms should make explicit to their users the consequences of storing sensitive, personal information like HIV status, as well as their documentation processes. Next, they should implement policies that manage how data are stored when users delete their accounts and protect these data from third-party consumers. Finally, ownership of users&rsquo; data should belong to the users themselves, rather than the platforms. Users should be able to pass along their information to other users without the platforms tracking it.</p>
+
+<p>HIV is a medical condition, but its eradication requires not just technical, or even sociotechnical, but socio<em>political</em> solutions. Indeed, the way in which designers and policy-makers frame HIV is an inherently political decision, one that will set the contours and boundaries of our response. The social computing community cannot do nothing, but it also must resist the desire to do everything. Designing user interfaces and platform policies to account for HIV will require a rigorous analysis of possible outcomes and consequences as well as a bedrock commitment to centering the voices and experiences of those impacted by HIV and the state&rsquo;s responses to it. Our commitments must account for the ways pathology and power intertwine to subjugate and otherize impacted communities at home and abroad.</p>
+
+<p>Designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the risk of re-entrenching the status quo and its incumbent inequalities and power relations (Dombrowski, <em>et al.</em>, 2016; Light, 2011; Irani, <em>et al.</em>, 2010; Bardzell, 2010). The social computing community must ground its efforts to design for HIV in clear political commitments to decriminalizing HIV and decentralizing power and information from the state. We must strive to unburden the weight of surveillance and incarceration on vulnerable and marginalized communities and work towards offloading the significant social and legal risks and pressures for people living with HIV. Moreover, our commitment to designing for HIV must not exclude nor obfuscate our capacity for direct action within and outside of the realms of design and research. This means fighting for the rights, dignity, and safety of people living with HIV in the streets and in the halls of local, national, and international political, legislative, and executive bodies.</p>
+
+<p>Our instinctual response to the failed and violent efforts of HIV criminalization and surveillance should not be &ldquo;there&rsquo;s an app for that,&rdquo; but rather &ldquo;there&rsquo;s a zap for that!&rdquo;. That is, the practice of designing for people with HIV should be a &ldquo;critical technical practice&rdquo; (Agre, 1997), undertaken with a mindset that sits uneasily between and is cognizant of both individual and structural power and consequence. Pioneered by the American gay liberation movement, a zap or &ldquo;zap action&rdquo; is a political action of direct and persistent public confrontation. Whether shouting down public figures or smashing pies into the faces of evangelicals, zaps aim to disrupt and disturb persons and institutions of authority to effect change (Cohen, 2018). In the words of AIDS Coalition to Unleash Power&rsquo;s (ACT UP) &ldquo;New Member Packet&rdquo;:</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;Zaps are a method for ACT UP members to register their disapproval of and anger toward the zap target. Zaps usually have more specific targets than actions. Because of this focus, numerous zapping techniques have been developed. ACT UP zaps individuals or organizations by: sending postcards or letters; invading offices and distributing fact sheets; sending (lots and lots of) faxes; picketing; outraged (and sometimes outrageous) phone calls. The more zappers who zap the zappee the better the zap.&rdquo;</td></tr></table>
+
+<p>A critical approach to designing for HIV requires the contesting of histories of incarceration, stigmatization, and surveillance and the ways in which the state exerts power and domination through its medicolegal levers of criminal law and public health surveillance. Intimate platform design should not only work to reduce the prevalence and stigma of HIV, but also to contest historic and present power imbalances and injustices between users, platforms, and the state. <img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71628" alt="End of article"></p>
+
+<p>&nbsp;</p>
+<a name="author"></a>
+<p><strong>About the authors</strong></p>
+
+<p><strong>Calvin Liang</strong> is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington. His research broadly focuses on technology&rsquo;s role in and out of queerness, health, and queer health.<br>E-mail: cliang02 [at] uw [dot] edu</p>
+
+<p><strong>Jevan Alexander Hutson</strong>, living with HIV for four years, is a technology policy advocate, human-computer interaction researcher, and J.D. candidate at the University of Washington School of Law. His research interests center on issues of technology, law, and social life, with a particular focus on intimate/sexual computing.<br>E-mail: jevanh [at] uw [dot] edu</p>
+
+<p><strong>Os Keyes</strong> is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.<br>E-mail: okeyes [at] uw [dot] edu</p>
+
+<p>&nbsp;</p>
+<p><strong>Acknowledgements</strong></p>
+
+<p>We dedicate this paper to the radical history of the AIDS Coalition to Unleash Power (ACT UP) and to all of the souls we&rsquo;ve lost and continue to lose to HIV/AIDS. We would like to thank Mary Fan, Sean Munson, and Julie Kientz for valuable conversations and feedback, and Margret Wander and Margaret Hopkins for their ongoing care and support. This research was partially funded by a Microsoft Ada Lovelace Fellowship.</p>
+
+<p>&nbsp;</p>
+<p><strong>Notes</strong></p>
+
+<p><a name="1"></a><a href="#1a">1.</a> Halperin and Hoppe, 2017, p. 349.</p>
+
+<p><a name="2"></a><a href="#2a">2.</a> Jackson, <em>et al.</em>, 2014, p. 596.</p>
+
+<p><a name="3"></a><a href="#3a">3.</a> Fan, 2011, p. 36.</p>
+
+<p><a name="4"></a><a href="#4a">4.</a> Halperin and Hoppe, 2017, p. 255.</p>
+
+<p><a name="5"></a><a href="#5a">5.</a> See FLA. STAT. ANN. &sect; 775.0877 (2017) (&ldquo;[I]t is an affirmative defense to a charge of violating this section that the person exposed knew that the offender was infected with HIV, knew that the action being taken could result in transmission of the HIV infection, and consented to the action voluntarily with that knowledge.&rdquo;). See also <a href="http://www.hivlawandpolicy.org/states/florida">http://www.hivlawandpolicy.org/states/florida</a>.</p>
+
+<p><a name="6"></a><a href="#6a">6.</a> Serostatus is defined as: &ldquo;The state of either having or not having detectable antibodies against a specific antigen, as measured by a blood test (serologic test). For example, HIV seropositive means that a person has detectable antibodies to HIV; seronegative means that a person does not have detectable HIV antibodies.&rdquo; U.S. Department of Health &amp; Human Services, Education Materials, AIDSINFO, at <a href="https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus" target="_blank">https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus</a>, accessed 30 August 2019.</p>
+
+<p><a name="7"></a><a href="#7a">7.</a> Lehman, <em>et al.</em>, 2014, p. 1,101.</p>
+
+<p><a name="8"></a><a href="#8a">8.</a> Schram, 2019, p. 611.</p>
+
+<p><a name="9"></a><a href="#9a">9.</a> Halperin, 2015, p. 207.</p>
+
+<p>&nbsp;</p>
+<p><strong>References</strong></p>
+
+<p>Barry D. Adam, Richard Elliott, Patrice Corriveau, and Ken English, 2014. &ldquo;Impacts of criminalization on the everyday lives of people living with HIV in Canada,&rdquo; <em>Sexuality Research and Social Policy</em>, volume 11, number 1, pp. 39&ndash;49.<br>doi: <a href="https://doi.org/10.1007/s13178-013-0131-8" target="_blank">https://doi.org/10.1007/s13178-013-0131-8</a>, accessed 5 September 2020.</p>
+
+<p>Barry D. Adam, James Murray, Suzanne Ross, Jason Oliver, Stephen G. Lincoln, and Vicki Rynard, 2011. &ldquo;Hivstigma.com, an innovative Web-supported stigma reduction intervention for gay and bisexual men,&rdquo; <em>Health Education Research</em>, volume 26, number 5. pp. 795&ndash;807.<br>doi: <a href="https://doi.org/10.1093/her/cyq078" target="_blank">https://doi.org/10.1093/her/cyq078</a>, accessed 5 September 2020.</p>
+
+<p>Philip E. Agre, 1997. &ldquo;Toward a critical technical practice: Lessons learned in trying to reform AI,&rdquo; In: Geof Bowker, Les Gasser, Leigh Star, and Bill Turner (editors). <em>Bridging the great divide: Social science, technical systems, and cooperative work</em>. Mahwah, N.J.: Erlbaum.</p>
+
+<p>Anonymous, 2000. &ldquo;Name brands: The effects of intrusive HIV legislation on high-risk demographic groups,&rdquo; <em>Harvard Law Review</em>, volume 113, number 8, pp. 2,098&ndash;2,113.<br>doi: <a href="https://doi.org/10.2307/1342321" target="_blank">https://doi.org/10.2307/1342321</a>, accessed 5 September 2020.</p>
+
+<p>Taunya Lovell Banks, 1989. &ldquo;Women and AIDS &mdash; Racism, sexism, and classism,&rdquo; <em>New York University Review of Law &amp; Social Change</em>, volume 17, pp. 351&ndash;385, and at <a href="https://digitalcommons.law.umaryland.edu/fac_pubs/328" target="_blank">https://digitalcommons.law.umaryland.edu/fac_pubs/328</a>, accessed 5 September 2020.</p>
+
+<p>Shaowen Bardzell, 2010. &ldquo;Feminist HCI: Taking stock and outlining an agenda for design,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,301&ndash;1,310.<br>doi: <a href="https://doi.org/10.1145/1753326.1753521" target="_blank">https://doi.org/10.1145/1753326.1753521</a>, accessed 5 September 2020.</p>
+
+<p>Fran&ccedil;oise Barr&eacute;-Sinoussi, Salim S. Abdool Karim, Jan Albert, Linda-Gail Bekker, Chris Beyrer, Pedro Cahn, Alexandra Calmy, Beatriz Grinsztejn, Andrew Grulich, Adeeba Kamarulzaman, Nagalingeswaran Kumarasamy, Mona R. Loutfy, Kamal M. El Filali, Souleymane Mboup, Julio S.G. Montaner, Paula Munderi, Vadim Pokrovsky, Anne-Mieke Vandamme, Benjamin Young, and Peter Godfrey-Faussett, 2018. &ldquo;Expert consensus statement on the science of HIV in the context of criminal law,&rdquo; <em>Journal of the International AIDS Society</em>, volume 21, number 7.<br>doi: <a href="https://doi.org/10.1002/jia2.25161" target="_blank">https://doi.org/10.1002/jia2.25161</a>, accessed 5 September 2020.</p>
+
+<p>Eric P.S. Baumer and M. Six Silberman, 2011. &ldquo;When the implication is not to design (technology),&rdquo; <em>CHI &rsquo;11: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 2,271&ndash;2,274.<br>doi: <a href="https://doi.org/10.1145/1978942.1979275" target="_blank">https://doi.org/10.1145/1978942.1979275</a>, accessed 5 September 2020.</p>
+
+<p>Allan M. Brandt, 1987. <em>No magic bullet: A social history of venereal disease in the United States since 1880</em>. Expanded edition. Oxford: Oxford University Press.</p>
+
+<p>Scott Burris and Edwin Cameron, 2008. &ldquo;The case against criminalization of HIV transmission,&rdquo; <em>Journal of the American Medical Association</em>, volume 300, number 5, pp. 578&ndash;581.<br>doi: <a href="https://doi.org/10.1001/jama.300.5.578" target="_blank">https://doi.org/10.1001/jama.300.5.578</a>, accessed 5 September 2020.</p>
+
+<p>Center for Disease Control and Prevention, 2019a. &ldquo;HIV and STD criminal laws,&rdquo; at <a href="https://www.cdc.gov/hiv/policies/law/states/exposure.html" target="_blank">https://www.cdc.gov/hiv/policies/law/states/exposure.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019b. &ldquo;HIV surveillance reports,&rdquo; at <a href="https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html" target="_blank">https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019c. &ldquo;HIV and women,&rdquo; at <a href="https://www.cdc.gov/hiv/group/gender/women/" target="_blank">https://www.cdc.gov/hiv/group/gender/women/</a>, accessed 5 September 2020.</p>
+
+<p>Center for HIV Law &amp; Policy, 2019. &ldquo;HIV criminalization in The United States,&rdquo; at <a href="http://www.hivlawandpolicy.org/sourcebook" target="_blank">http://www.hivlawandpolicy.org/sourcebook</a>, accessed 2 February 2020.</p>
+
+<p>Hollie Clark, Aruna Surendera Babu, Ellen Weiss Wiewel, Jenevieve Opoku, and Nicole Crepaz, 2017. &ldquo;Diagnosed HIV infection in transgender adults and adolescents: Results from the National HIV Surveillance System, 2009&ndash;2014,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 9, pp. 2,774&ndash;2,783.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1656-7" target="_blank">https://doi.org/10.1007/s10461-016-1656-7</a>, accessed 5 September 2020.</p>
+
+<p>Sascha Cohen, 2018. &ldquo;How gay activists challenged the politics of civility,&rdquo; <em>Smithsonian Magazine</em> (10 July), at <a href="https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/" target="_blank">https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2004. &ldquo;Reasons for HIV disclosure/nondisclosure in close relationships: Testing a model of HIV-disclosure decision making,&rdquo; <em>Journal of Social and Clinical Psychology</em>, volume 23, number 6, pp. 747&ndash;767.<br>doi: <a href="https://doi.org/10.1521/jscp.23.6.747.54804" target="_blank">https://doi.org/10.1521/jscp.23.6.747.54804</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2002. &ldquo;Perceived HIV-related stigma and HIV disclosure to relationship partners after finding out about the seropositive diagnosis,&rdquo; <em>Journal of Health Psychology</em>, volume 7, number 4, pp. 415&ndash;432.<br>doi: <a href="https://doi.org/10.1177/1359105302007004330" target="_blank">https://doi.org/10.1177/1359105302007004330</a>, accessed 5 September 2020.</p>
+
+<p>Lynn Dombrowski, Ellie Harmon, and Sarah Fox, 2016. &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments,&rdquo; <em>DIS &rsquo;16: Proceedings of the 2016 ACM Conference on Designing Interactive Systems</em>, pp. 656&ndash;671.<br>doi: <a href="https://doi.org/10.1145/2901790.2901861" target="_blank">https://doi.org/10.1145/2901790.2901861</a>, accessed 5 September 2020.</p>
+
+<p>Robert W. Eisinger, Carl W. Dieffenbach, and Anthony S. Fauci, 2019. &ldquo;HIV viral load and transmissibility of HIV infection: Undetectable equals untransmittable,&rdquo; <em>Journal of the American Medical Association</em>, volume 321, number 5, pp. 451&ndash;452.<br>doi: <a href="https://doi.org/10.1001/jama.2018.21167" target="_blank">https://doi.org/10.1001/jama.2018.21167</a>, accessed 5 September 2020.</p>
+
+<p>Richard Elliot, 2002. &ldquo;Criminal law, public health and HIV transmission: A policy options paper,&rdquo; <em>UNAIDS (Joint United Nations Programme on HIV/AIDS)</em>, at <a href="https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf" target="_blank">https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf</a>, accessed 5 September 2020.</p>
+
+<p>Elizabeth F. Emens, 2008. &ldquo;Intimate discrimination: The state&rsquo;s role in the accidents of sex and love,&rdquo; <em>Harvard Law Review</em>, volume 122, number 5, pp. 1,307&ndash;1,402.<br>doi: <a href="https://doi.org/10.2307/40379752" target="_blank">https://doi.org/10.2307/40379752</a>, accessed 5 September 2020.</p>
+
+<p>Steven Epstein, 1996. <em>Impure science: AIDS, activism, and the politics of knowledge</em>. Berkeley: University of California Press.</p>
+
+<p>Amy L. Fairchild, Ronald Bayer, and James Colgrove, with Daniel Wolfe, 2007. <em>Searching eyes: Privacy, the state, and disease surveillance in America</em>. Berkeley: University of California Press.</p>
+
+<p>Mary D. Fan, 2012. &ldquo;Decentralizing STD surveillance: Toward better informed sexual consent,&rdquo; <em>Yale Journal of Health Policy, Law, and Ethics</em>, volume 12, number 1, pp. 1&ndash;38.</p>
+
+<p>Mary D. Fan, 2011. &ldquo;Sex, privacy, and public health in a casual encounters culture,&rdquo; <em>University of California Davis Law Review</em>, volume 25, pp. 531&ndash;596.</p>
+
+<p>Tim Fitzsimons, 2019. &ldquo;Inside Grindr, fears that China wanted to access user data via HIV research,&rdquo; <em>NBC News</em> (2 April), at <a href="https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996" target="_blank">https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996</a>, accessed 5 September 2020.</p>
+
+<p>Chandra L. Ford, Kathryn D. Whetten, Susan A. Hall, Jay S. Kaufman, and Angela D. Thrasher, 2007. &ldquo;Black sexuality, social construction, and research targeting &lsquo;The Down Low&rsquo; (&lsquo;The DL&rsquo;),&rdquo; <em>Annals of Epidemiology</em>, volume 17, number 3, pp. 209&ndash;216.<br>doi: <a href="https://doi.org/10.1016/j.annepidem.2006.09.006" target="_blank">https://doi.org/10.1016/j.annepidem.2006.09.006</a>, accessed 5 September 2020.</p>
+
+<p>A.J. Fortin, 1995. &ldquo;AIDS, surveillance, and public policy,&rdquo; <em>Research in Law and Policy Studies</em>, volume 4, pp. 173&ndash;197.</p>
+
+<p>Marilou Gagnon, 2012. &ldquo;Toward a critical response to HIV criminalization: Remarks on advocacy and social justice,&rdquo; <em>Journal of the Association of Nurses in AIDS Care</em>, volume 23, number 1, pp. 11&ndash;15.<br>doi: <a href="https://doi.org/10.1016/j.jana.2011.08.012" target="_blank">https://doi.org/10.1016/j.jana.2011.08.012</a>, accessed 5 September 2020.</p>
+
+<p>Carol L. Galletly and Steven D. Pinkerton, 2006. &ldquo;Conflicting messages: How criminal HIV disclosure laws undermine public health efforts to control the spread of HIV,&rdquo; <em>AIDS and Behavior</em>, volume 10, number 5, pp. 451&ndash;461.<br>doi: <a href="https://doi.org/10.1007/s10461-006-9117-3" target="_blank">https://doi.org/10.1007/s10461-006-9117-3</a>, accessed 5 September 2020.</p>
+
+<p>C. Galletly, Z. Lazzarini, C. Sanders, and S.D. Pinkerton, 2014. &ldquo;Criminal HIV exposure laws: Moving forward,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 1,011&ndash;1,013.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0731-1" target="_blank">https://doi.org/10.1007/s10461-014-0731-1</a>, accessed 5 September 2020.</p>
+
+<p>Robert C. Gallo, 2006. &ldquo;A reflection on HIV/AIDS research after 25 years,&rdquo; <em>Retrovirology</em>, volume 3, article number 72.<br>doi: <a href="https://doi.org/10.1186/1742-4690-3-72" target="_blank">https://doi.org/10.1186/1742-4690-3-72</a>, accessed 5 September 2020.</p>
+
+<p>George Gallup, Jr. and Jim Castelli, 1987. &ldquo;Poll catalogs views on AIDS by religion,&rdquo; <em>Dallas Morning News</em> (27 September), p. 45A.</p>
+
+<p>Lawrence O. Gostin, Scott Burris, and Zita Lazzarini, 1999. &ldquo;The law and the public&rsquo;s health: A study of infectious disease law in the United States,&rdquo; <em>Columbia Law Review</em>, volume 99, number 1, pp. 59&ndash;128.</p>
+
+<p>Ben Green, 2018. &ldquo;Data science as political action: Grounding data science in a politics of justice,&rdquo; <em>arXiv</em>:1811.03435 (6 November), at <a href="https://arxiv.org/abs/1811.03435" target="_blank">https://arxiv.org/abs/1811.03435</a>, accessed 5 September 2020.</p>
+
+<p>Kathryn Greene, Valerian J. Derlega, Gust A. Yep, and Sandra Petronio, 2003. <em>Privacy and disclosure of HIV in interpersonal relationships: A sourcebook for researchers and practitioners</em>. Mahwah, N.J.: Lawrence Erlbaum Associates.</p>
+
+<p>David M. Halperin, 2015. &ldquo;The biopolitics of HIV prevention discourse,&rdquo; In: Vernon W. Cisney and Nicolae Morar (editors). <em>Biopower: Foucault and beyond</em>. Chicago: University of Chicago Press, pp. 199&ndash;227.</p>
+
+<p>David M. Halperin and Trevor Hoppe (editors), 2017. <em>The war on sex</em>. Durham, N.C.: Duke University Press.</p>
+
+<p>Mark J. Handel and Irina Shklovski, 2012. &ldquo;Disclosure, ambiguity and risk reduction in real-time dating sites,&rdquo; <em>GROUP &rsquo;12: Proceedings of the 17th ACM International Conference on Supporting Group Work</em>, pp. 175&ndash;178.<br>doi: <a href="https://doi.org/10.1145/2389176.2389203" target="_blank">https://doi.org/10.1145/2389176.2389203</a>, accessed 5 September 2020.</p>
+
+<p>Jean Hardy and Silvia Lindtner, 2017. &ldquo;Constructing a desiring user: Discourse, rurality, and design in location-based social networks,&rdquo; <em>CSCW &rsquo;17: Proceedings of the 2017 ACM Conference on Computer Supported Cooperative Work and Social Computing</em>, pp. 13&ndash;25.<br>doi: <a href="https://doi.org/10.1145/2998181.2998347" target="_blank">https://doi.org/10.1145/2998181.2998347</a>, accessed 5 September 2020.</p>
+
+<p>Dini Harsono, Carol L. Galletly, Elaine O&rsquo;Keefe, and Zita Lazzarini, 2017. &ldquo;Criminalization of HIV exposure: A review of empirical studies in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 1, pp. 27&ndash;50.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1540-5" target="_blank">https://doi.org/10.1007/s10461-016-1540-5</a>, accessed 5 September 2020.</p>
+
+<p>Trevor Hoppe, 2018. <em>Punishing disease: HIV and the criminalization of sickness</em>. Berkeley: University of California Press.</p>
+
+<p>Hsiu-Fang Hsieh and Sarah E. Shannon, 2005. &ldquo;Three approaches to qualitative content analysis,&rdquo; <em>Qualitative Health Research</em>, volume 15, number 9, pp. 1,277&ndash;1,288.<br>doi: <a href="https://doi.org/10.1177/1049732305276687" target="_blank">https://doi.org/10.1177/1049732305276687</a>, accessed 5 September 2020.</p>
+
+<p>Jevan A. Hutson, Jessie G. Taft, Solon Barocas, and Karen Levy, 2018. &ldquo;Debiasing desire: Addressing bias &amp; discrimination on intimate platforms,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 73.<br>doi: <a href="https://doi.org/10.1145/3274342" target="_blank">https://doi.org/10.1145/3274342</a>, accessed 5 September 2020.</p>
+
+<p>Lilly Irani, Janet Vertesi, Paul Dourish, Kavita Philip, and Rebecca E. Grinter, 2010. &ldquo;Postcolonial computing: A lens on design and development,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,311&ndash;1,320.<br>doi: <a href="https://doi.org/10.1145/1753326.1753522" target="_blank">https://doi.org/10.1145/1753326.1753522</a>, accessed 5 September 2020.</p>
+
+<p>Steven J. Jackson, Tarleton Gillespie, and Sandy Payette, 2014. &ldquo;The policy knot: Re-integrating policy, practice and design in CSCW studies of social computing,&rdquo; <em>CSCW &rsquo;14: Proceedings of the 17th ACM Conference on Computer Supported Cooperative Work &amp; Social Computing</em>, pp. 588&ndash;602.<br>doi: <a href="https://doi.org/10.1145/2531602.2531674" target="_blank">https://doi.org/10.1145/2531602.2531674</a>, accessed 5 September 2020.</p>
+
+<p>Paula C. Johnson, 1992. &ldquo;Silence equals death: The response to AIDS within communities of color,&rdquo; <em>University of Illinois Law Review</em>, volume 1992, pp. 1,075&ndash;1,083.</p>
+
+<p>Ralf J&uuml;rgens, Jonathan Cohen, Edwin Cameron, Scott Burris, Michaela Clayton, Richard Elliott, Richard Pearshouse, Anne Gathumbi, and Delme Cupido, 2009. &ldquo;Ten reasons to oppose the criminalization of HIV exposure or transmission,&rdquo; <em>Reproductive Health Matters</em>, volume 17, number 34, pp. 163&ndash;172.<br>doi: <a href="https://doi.org/10.1016/S0968-8080(09)34462-6" target="_blank">https://doi.org/10.1016/S0968-8080(09)34462-6</a>, accessed 5 September 2020.</p>
+
+<p>Gopinaath Kannabiran, Shaowen Bardzell, and Jeffrey Bardzell, 2012. &ldquo;Designing (for) desire: a critical study of technosexuality in HCI,&rdquo; <em>NordiCHI &rsquo;12: Proceedings of the Seventh Nordic Conference on Human-Computer Interaction: Making Sense Through Design</em>, pp. 655&ndash;664.<br>doi: <a href="https://doi.org/10.1145/2399016.2399116" target="_blank">https://doi.org/10.1145/2399016.2399116</a>, accessed 5 September 2020.</p>
+
+<p>C&eacute;cile Kazatchkine, Edwin Bernard, and Patrick Eba, 2015. &ldquo;Ending overly broad HIV criminalization: Canadian scientists and clinicians stand for justice,&rdquo; <em>Journal of the International AIDS Society</em>, volume 18, number 1, pp. 201&ndash;226.<br>doi: <a href="https://doi.org/10.7448/IAS.18.1.20126" target="_blank">https://doi.org/10.7448/IAS.18.1.20126</a>, accessed 5 September 2020.</p>
+
+<p>Os Keyes, Jevan Hutson, and Meredith Durbin, 2019. &ldquo;A mulching proposal: Analysing and improving an algorithmic system for turning the elderly into high-nutrient slurry,&rdquo; <em>CHI EA &rsquo;19: Extended Abstracts of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number alt06.<br>doi: <a href="https://doi.org/10.1145/3290607.3310433" target="_blank">https://doi.org/10.1145/3290607.3310433</a>, accessed 5 September 2020.</p>
+
+<p>Jeffrey V. Lazarus, Kelly Safreed-Harmon, Simon E. Barton, Dominique Costagliola, Nikos Dedes, Julia del Amo Valero, Jose M. Gatell, Ricardo Baptista-Leite, Lus Mend&atilde;o, Kholoud Porter, Stefano Vella, and J&uuml;rgen Kurt Rockstroh, 2016. &ldquo;Beyond viral suppression of HIV &mdash; The new quality of life frontier,&rdquo; <em>BMC Medicine</em>, volume 14, number 1, article number 94.<br>doi: <a href="https://doi.org/10.1186/s12916-016-0640-4" target="_blank">https://doi.org/10.1186/s12916-016-0640-4</a>, accessed 5 September 2020.</p>
+
+<p>J. Stan Lehman, Meredith H. Carr, Allison J. Nichol, Alberto Ruisanchez, David W. Knight, Anne E. Langford, Simone C. Gray, and Jonathan H. Mermin, 2014. &ldquo;Prevalence and public health implications of state laws that criminalize potential HIV exposure in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 997&ndash;1,006.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0724-0" target="_blank">https://doi.org/10.1007/s10461-014-0724-0</a>, accessed 5 September 2020.</p>
+
+<p>Karen Levy and Solon Barocas, 2018. &ldquo;Designing against discrimination in online markets,&rdquo; <em>Berkeley Technology Law Journal</em>, volume 32, number 3, pp. 1,183&ndash;1,237.<br>doi: <a href="https://doi.org/10.15779/Z38BV79V7K" target="_blank">https://doi.org/10.15779/Z38BV79V7K</a>, accessed 5 September 2020.</p>
+
+<p>Eric Lichtblau and William M. Arkin, 2014. &ldquo;More federal agencies are using undercover operations,&rdquo; <em>New York Times</em> (15 November), at <a href="https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html" target="_blank">https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html</a>, accessed 5 September 2020.</p>
+
+<p>Ann Light, 2011. &ldquo;HCI as heterodoxy: Technologies of identity and the queering of interaction with computers,&rdquo; <em>Interacting with Computers</em>, volume 23, number 5, pp. 430&ndash;438.<br>doi: <a href="https://doi.org/10.1016/j.intcom.2011.02.002" target="_blank">https://doi.org/10.1016/j.intcom.2011.02.002</a>, accessed 5 September 2020.</p>
+
+<p>Ben Light, Jean Burgess, and Stefanie Duguay, 2018. &ldquo;The walkthrough method: An approach to the study of apps,&rdquo; <em>New Media &amp; Society</em>, volume 20, number 3, pp. 881&ndash;900.<br>doi: <a href="https://doi.org/10.1177/1461444816675438" target="_blank">https://doi.org/10.1177/1461444816675438</a>, accessed 5 September 2020.</p>
+
+<p>Anish P. Mahajan, Jennifer N. Sayles, Vishal A. Patel, Robert H. Remien, Daniel Ortiz, Greg Szekeres, and Thomas J. Coates, 2008. &ldquo;Stigma in the HIV/AIDS epidemic: A review of the literature and recommendations for the way forward,&rdquo; <em>AIDS</em>, volume 22, supplement 2, pp. S67&ndash;S79.<br>doi: <a href="https://doi.org/10.1097/01.aids.0000327438.13291.62" target="_blank">https://doi.org/10.1097/01.aids.0000327438.13291.62</a>, accessed 5 September 2020.</p>
+
+<p>Alexandra McCallum, 2014. &ldquo;Criminalizing the transmission of HIV: Consent, disclosure, and online dating,&rdquo; <em>Utah Law Review</em>, volume 2014, number 3, article 5, at <a href="https://dc.law.utah.edu/ulr/vol2014/iss3/5" target="_blank">https://dc.law.utah.edu/ulr/vol2014/iss3/5</a>, accessed 5 September 2020.</p>
+
+<p>Donna Hubbard McCree and Matthew Hogben, 2010. &ldquo;The contribution to and context of other sexually transmitted diseases and tuberculosis in the HIV/AIDS epidemic among African Americans,&rdquo; In: Donna Hubbard McCree, Kenneth Jones, and Ann O&rsquo;Leary (editors). <em>African Americans and HIV/AIDS: Understanding and addressing the epidemic</em>, New York: Springer, pp. 3&ndash;12.<br>doi: <a href="https://doi.org/10.1007/978-0-387-78321-5_1" target="_blank">https://doi.org/10.1007/978-0-387-78321-5_1</a>, accessed 5 September 2020.</p>
+
+<p>William C. Miller, Carol A. Ford, Martina Morris, Mark S. Handcock, John L. Schmitz, Marcia M. Hobbs, Myron S. Cohen, Kathleen Mullan Harris, and J. Richard Udry, 2004. &ldquo;Prevalence of chlamydial and gonococcal infections among young adults in the United States,&rdquo; <em>Journal of the American Medical Association</em>, volume 291, number 18, pp. 2,229&ndash;2,236.<br>doi: <a href="https://doi.org/10.1007/978-0-387-78321-5_1" target="_blank">https://doi.org/10.1007/978-0-387-78321-5_1</a>, accessed 5 September 2020.</p>
+
+<p>Viviane Namaste, 2015. <em>Oversight: Critical reflections on feminist research and politics</em>. Toronto: Women&rsquo;s Press.</p>
+
+<p>Angela Perone, 2013. &ldquo;From punitive to proactive: An alternative approach for responding to HIV criminalization that departs from penalizing marginalized communities,&rdquo; <em>Hastings Women&rsquo;s Law Journal</em>, volume 24, pp. 363&ndash;406, and at <a href="https://repository.uchastings.edu/hwlj/vol24/iss2/5" target="_blank">https://repository.uchastings.edu/hwlj/vol24/iss2/5</a>, accessed 5 September 2020.</p>
+
+<p>Deana A. Pollard, 2006. &ldquo;Sex torts,&rdquo; <em>Minnesota Law Review</em>, volume 91, pp. 769&ndash;824, and at <a href="https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf" target="_blank">https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf</a>, accessed 5 September 2020.</p>
+
+<p>POZ, 2015. &ldquo;Man with HIV arrested for seeking sex on social media,&rdquo; (22 July), at <a href="https://www.poz.com/article/stlouis-hiv-arrest-27534-4846" target="_blank">https://www.poz.com/article/stlouis-hiv-arrest-27534-4846</a>, accessed 5 September 2020.</p>
+
+<p>Russell K. Robinson, 2007. &ldquo;Structural dimensions of romantic preferences,&rdquo; <em>Fordham Law Review</em>, volume 76, pp. 2,787&ndash;2,820, and at <a href="http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/" target="_blank">http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/</a>, accessed 5 September 2020.</p>
+
+<p>Michael J. Rosenfeld and Reuben J. Thomas, 2012. &ldquo;Searching for a mate: The rise of the Internet as a social intermediary,&rdquo; <em>American Sociological Review</em>, volume 77, number 4, pp. 523&ndash;547.<br>doi: <a href="https://doi.org/10.1177/0003122412448050" target="_blank">https://doi.org/10.1177/0003122412448050</a>, accessed 5 September 2020.</p>
+
+<p>B.R. Simon Rosser, J. Michael Wilkerson, Derek J. Smolenski, J. Michael Oakes, Joseph Konstan, Keith J. Horvath, Gunna R. Kilian, David S. Novak, Gene P. Danilenko, and Richard Morgan, 2011. &ldquo;The future of Internet-based HIV prevention: A report on key findings from the Men&rsquo;s INTernet (MINTS-I, II) Sex Studies,&rdquo; <em>AIDS and Behavior</em>, volume 15, supplement 1, pp. S91&ndash;S100.<br>doi: <a href="https://doi.org/10.1007/s10461-011-9910-5" target="_blank">https://doi.org/10.1007/s10461-011-9910-5</a>, accessed 5 September 2020.</p>
+
+<p>Brian Schram, 2019. &ldquo;Accidental orientations: Rethinking queerness in archival times,&rdquo; <em>Surveillance &amp; Society</em>, volume 17, number 5, pp. 602&ndash;617.<br>doi: <a href="https://doi.org/10.24908/ss.v17i5.8688" target="_blank">https://doi.org/10.24908/ss.v17i5.8688</a>, accessed 5 September 2020.</p>
+
+<p>Junichi P. Semitsu, 2011. &ldquo;From Facebook to mug shot: How the dearth of social networking privacy rights revolutionized online government surveillance,&rdquo; <em>Pace Law Review</em>, volume 31, number 1, pp. 291&ndash;381, and at <a href="https://digitalcommons.pace.edu/plr/vol31/iss1/7" target="_blank">https://digitalcommons.pace.edu/plr/vol31/iss1/7</a>, accessed 5 September 2020.</p>
+
+<p>Sero Project, 2012. &ldquo;National criminalization survey preliminary results,&rdquo; (25 July), at <a href="https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/" target="_blank">https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/</a>, accessed 30 August 2019.</p>
+
+<p>Julianne M. Serovich and Katie E. Mosack, 2003. &ldquo;Reasons for HIV disclosure or nondisclosure to casual sexual partners,&rdquo; <em>AIDS Education and Prevention</em>, volume 15, number 1, pp. 70&ndash;80.</p>
+
+<p>Natasha Singer, 2018. &ldquo;Grindr sets off privacy firestorm after sharing users&rsquo; H.I.V.-status data,&rdquo; <em>New York Times</em> (3 April), at <a href="https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html" target="_blank">https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html</a>, accessed 5 September 2020.</p>
+
+<p>Lucy Suchman, 2011. &ldquo;Anthropological relocations and the limits of design,&rdquo; <em>Annual Review of Anthropology</em>, volume 40, pp. 1&ndash;18.<br>doi: <a href="https://doi.org/10.1146/annurev.anthro.041608.105640" target="_blank">https://doi.org/10.1146/annurev.anthro.041608.105640</a>, accessed 5 September 2020.</p>
+
+<p>Cass R. Sunstein, 1996. &ldquo;Social norms and social roles,&rdquo; <em>Columbia Law Review</em>, volume 96, number 4, pp. 903&ndash;968.</p>
+
+<p>Patricia Sweeney, Simone C. Gray, David W. Purcell, Jenny Sewell, Aruna Surendera Babu, Brett A. Tarver, Joseph Prejean, and Jonathan Mermin, 2017. &ldquo;Association of HIV diagnosis rates and laws criminalizing HIV exposure in the United States,&rdquo; <em>AIDS</em>, volume 31, number 10, pp. 1,483&ndash;1,488.<br>doi: <a href="https://doi.org/10.1097/QAD.0000000000001501" target="_blank">https://doi.org/10.1097/QAD.0000000000001501</a>, accessed 5 September 2020.</p>
+
+<p>Bryan L. Sykes, Trevor A. Hoppe, and Kristen D. Maziarka, 2016. &ldquo;Cruel intentions? HIV prevalence and criminalization during an age of mass incarceration, U.S. 1999 to 2012,&rdquo; <em>Medicine (Baltimore)</em>, volume 95, number 16, e3352.<br>doi: <a href="https://doi.org/10.1097/MD.0000000000003352" target="_blank">https://doi.org/10.1097/MD.0000000000003352</a>, accessed 5 September 2020.</p>
+
+<p>Samuel Hardman Taylor, Jevan Alexander Hutson, and Tyler Richard Alicea, 2017. &ldquo;Social consequences of Grindr use: Extending the Internet-enhanced self-disclosure hypothesis,&rdquo; <em>CHI &rsquo;17: Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems</em>, pp. 6,645&ndash;6,657.<br>doi: <a href="https://doi.org/10.1145/3025453.3025775" target="_blank">https://doi.org/10.1145/3025453.3025775</a>, accessed 5 September 2020.</p>
+
+<p>Steven Thrasher, 2015. &ldquo;A Black body on trial: The conviction of HIV-positive &lsquo;Tiger Mandingo&rsquo;,&rdquo; <em>BuzzFeed News</em> (30 November), at <a href="https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m" target="_blank">https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m</a>, accessed 5 September 2020.</p>
+
+<p>Liming Wang, Dylan Podson, Zihuang Chen, Hongyan Lu, Vania Wang, Colin Shepard, John K. Williams, and Guodong Mi, 2019. &ldquo;Using social media to increase HIV testing among men who have sex with men &mdash; Beijing, China, 2013&ndash;2017,&rdquo; <em>Morbidity and Mortality Weekly Report</em>, volume 68, number 21, pp. 478&ndash;482.<br>doi: <a href="http://dx.doi.org/10.15585/mmwr.mm6821a3" target="_blank">http://dx.doi.org/10.15585/mmwr.mm6821a3</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward, 2005. &ldquo;Partner notification and contact-tracing,&rdquo; <em>Medicine</em>, volume 33, number 9, pp. 28&ndash;30.<br>doi: <a href="https://doi.org/10.1383/medc.2005.33.9.28" target="_blank">https://doi.org/10.1383/medc.2005.33.9.28</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward and Gill Bell, 2014. &ldquo;Partner notification,&rdquo; <em>Medicine (Abingdon)</em>, volume 42, number 6, pp. 314&ndash;317.<br>doi: <a href="https://doi.org/10.1016/j.mpmed.2014.03.013" target="_blank">https://doi.org/10.1016/j.mpmed.2014.03.013</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Andreas Gutmann, M. Angela Sasse, and Ann Blandford, 2018. &ldquo;Privacy unraveling around explicit HIV status disclosure fields in the online geosocial hookup app Grindr,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 181.<br>doi: <a href="https://doi.org/10.1145/3274450" target="_blank">https://doi.org/10.1145/3274450</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Juan F. Maestre, Jo Gibbs, Chia-Fang Chung, and Ann Blandford, 2019. &ldquo;Signal appropriation of explicit HIV status disclosure fields in sex-social apps used by gay and bisexual men,&rdquo; <em>CHI &rsquo;19: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number 692.<br>doi: <a href="https://doi.org/10.1145/3290605.3300922" target="_blank">https://doi.org/10.1145/3290605.3300922</a>, accessed 5 September 2020.</p>
+
+<p>Dylan Eric Wittkower, 2016. &ldquo;Lurkers, creepers, and virtuous interactivity: From property rights to consent to care as a conceptual basis for privacy concerns and information ethics,&rdquo; <em>First Monday</em>, volume 21, number 10, at <a href="https://firstmonday.org/article/view/6948/5628" target="_blank">https://firstmonday.org/article/view/6948/5628</a>, accessed 5 September 2020.<br>doi: <a href="https://doi.org/10.5210/fm.v21i10.6948" target="_blank">https://doi.org/10.5210/fm.v21i10.6948</a>, accessed 5 September 2020.</p>
+
+<p>Dan Wohlfeiler, Jennifer Hecht, Jonathan Volk, H. Fisher Raymond, Tom Kennedy, and Willi McFarland, 2013. &ldquo;How can we improve online HIV and STD prevention for men who have sex with men? Perspectives of hook-up website owners, website users, and HIV/STD directors,&rdquo; <em>AIDS and Behavior</em>, volume 17, number 9, pp. 3,024&ndash;3,033.<br>doi: <a href="https://doi.org/10.1007/s10461-012-0375-y" target="_blank">https://doi.org/10.1007/s10461-012-0375-y</a>, accessed 5 September 2020.</p>
+
+<p>Mar&iacute;a Cecilia Zea, Carol A. Reisen, Paul J. Poppen, and Rafael M. D&iacute;az, 2003. &ldquo;Asking and telling: Communication about HIV status among Latino HIV-positive gay men,&rdquo; <em>AIDS and Behavior</em>, volume 7, number 2, pp. 143&ndash;152.<br>doi: <a href="https://doi.org/10.1023/A:1023994207984" target="_blank">https://doi.org/10.1023/A:1023994207984</a>, accessed 5 September 2020.</p>
+
+<p>Shoshana Zuboff, 2019. <em>The age of surveillance capitalism: The fight for a human future at the new frontier of power</em>. London: Profile Books.</p>
+
+<p>&nbsp;</p>
+<hr width="300">
+
+<p><strong>Editorial history</strong></p>
+<p>Received 17 October 2019; revised 12 February 2020; accepted 28 August 2020.</p>
+
+<hr>
+
+<p><a href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" src="https://i.creativecommons.org/l/by/4.0/80x15.png"></a><br>This paper is licensed under a <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</p>
+
+<p>Surveillance, stigma &amp; sociotechnical design for HIV<br>by Calvin Liang, Jevan Alexander Hutson, and Os Keyes.<br><em>First Monday</em>, Volume 25, Number 10 - 5 October 2020<br>https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729<br>doi: <a href="http://dx.doi.org/10.5210/fm.v25i10.10274" target="_blank">http://dx.doi.org/10.5210/fm.v25i10.10274</a></p>
+</blockquote>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html
new file mode 100644
index 0000000..2633256
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_landingpage.html
@@ -0,0 +1,616 @@
+ <!DOCTYPE html>
+<html lang="en-US" xml:lang="en-US">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ | First Monday
+ </title>
+
+
+<meta name="generator" content="Open Journal Systems 3.1.2.0">
+<link rel="icon" href="https://firstmonday.org/ojs/public/journals/3/favicon_en_US.gif">
+<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+<meta name="DC.Coverage" xml:lang="en" content=""/>
+<meta name="DC.Creator.PersonalName" content="Calvin Liang"/>
+<meta name="DC.Creator.PersonalName" content="Jevan Alexander Hutson"/>
+<meta name="DC.Creator.PersonalName" content="Os Keyes"/>
+<meta name="DC.Date.created" scheme="ISO8601" content="2020-09-10"/>
+<meta name="DC.Date.dateSubmitted" scheme="ISO8601" content="2019-09-15"/>
+<meta name="DC.Date.issued" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Date.modified" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Description" xml:lang="en" content="Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."/>
+<meta name="DC.Format" scheme="IMT" content="text/html"/>
+<meta name="DC.Identifier" content="10274"/>
+<meta name="DC.Identifier.DOI" content="10.5210/fm.v25i10.10274"/>
+<meta name="DC.Identifier.URI" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="DC.Language" scheme="ISO639-1" content="en"/>
+<meta name="DC.Rights" content="Copyright (c) 2020 First Monday"/>
+<meta name="DC.Rights" content=""/>
+<meta name="DC.Source" content="First Monday"/>
+<meta name="DC.Source.ISSN" content="1396-0466"/>
+<meta name="DC.Source.URI" content="https://firstmonday.org/ojs/index.php/fm"/>
+<meta name="DC.Subject" xml:lang="en" content="HIV"/>
+<meta name="DC.Subject" xml:lang="en" content="online dating"/>
+<meta name="DC.Subject" xml:lang="en" content="design"/>
+<meta name="DC.Subject" xml:lang="en" content="policy"/>
+<meta name="DC.Subject" xml:lang="en" content="surveillance"/>
+<meta name="DC.Subject" xml:lang="en" content="intimacy"/>
+<meta name="DC.Subject" xml:lang="en" content="social computing"/>
+<meta name="DC.Subject" xml:lang="en" content="social justice"/>
+<meta name="DC.Title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="DC.Type" content="Text.Serial.Journal"/>
+<meta name="DC.Type" xml:lang="en" content="Qualitative; Content analysis"/>
+<meta name="DC.Type.articleType" content="Articles"/>
+<meta name="gs_meta_revision" content="1.1"/>
+<meta name="citation_journal_title" content="First Monday"/>
+<meta name="citation_journal_abbrev" content="1"/>
+<meta name="citation_issn" content="1396-0466"/>
+<meta name="citation_author" content="Calvin Liang"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_author" content="Jevan Alexander Hutson"/>
+<meta name="citation_author_institution" content="University of Washington, School of Law"/>
+<meta name="citation_author" content="Os Keyes"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="citation_date" content="2020/09/10"/>
+<meta name="citation_doi" content="10.5210/fm.v25i10.10274"/>
+<meta name="citation_abstract_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="citation_language" content="en"/>
+<meta name="citation_keywords" xml:lang="en" content="HIV"/>
+<meta name="citation_keywords" xml:lang="en" content="online dating"/>
+<meta name="citation_keywords" xml:lang="en" content="design"/>
+<meta name="citation_keywords" xml:lang="en" content="policy"/>
+<meta name="citation_keywords" xml:lang="en" content="surveillance"/>
+<meta name="citation_keywords" xml:lang="en" content="intimacy"/>
+<meta name="citation_keywords" xml:lang="en" content="social computing"/>
+<meta name="citation_keywords" xml:lang="en" content="social justice"/>
+<meta name="citation_fulltext_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"/>
+<link rel="alternate" type="application/atom+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+<link rel="alternate" type="application/rdf+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+<link rel="alternate" type="application/rss+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <link rel="stylesheet" href="https://firstmonday.org/ojs/index.php/fm/$$$call$$$/page/page/css?name=stylesheet" type="text/css" /><link rel="stylesheet" href="//fonts.googleapis.com/css?family=Noto+Sans:400,400italic,700,700italic" type="text/css" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.css" type="text/css" /><link rel="stylesheet" href="https://firstmonday.org/ojs/public/journals/3/styleSheet.css" type="text/css" />
+</head>
+<body class="pkp_page_article pkp_op_view has_site_logo" dir="ltr">
+
+ <div class="cmp_skip_to_content">
+ <a href="#pkp_content_main">Skip to main content</a>
+ <a href="#pkp_content_nav">Skip to main navigation menu</a>
+ <a href="#pkp_content_footer">Skip to site footer</a>
+ </div>
+ <div class="pkp_structure_page">
+
+ <header class="pkp_structure_head" id="headerNavigationContainer" role="banner">
+ <div class="pkp_head_wrapper">
+
+ <div class="pkp_site_name_wrapper">
+ <div class="pkp_site_name">
+ <a href=" https://firstmonday.org/ojs/index.php/fm/index
+ " class="is_img">
+ <img src="https://firstmonday.org/ojs/public/journals/3/pageHeaderLogoImage_en_US.gif" width="252" height="102" alt="Page Header Logo" />
+ </a>
+ </div>
+ </div>
+
+
+ <nav class="pkp_navigation_primary_row" aria-label="Site Navigation">
+ <div class="pkp_navigation_primary_wrapper">
+ <ul id="navigationPrimary" class="pkp_navigation_primary pkp_nav_list">
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About
+ </a>
+ <ul>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About the Journal
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/editorialTeam">
+ Editorial Team
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/privacy">
+ Privacy Statement
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/contact">
+ Contact
+ </a>
+ </li>
+ </ul>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search">
+ Search
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/current">
+ Current
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/announcement">
+ Announcements
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/submissions">
+ Submissions
+ </a>
+ </li>
+ </ul>
+
+
+
+ <form class="pkp_search" action="https://firstmonday.org/ojs/index.php/fm/search/search" method="post" role="search">
+ <input type="hidden" name="csrfToken" value="671acac3a608346eb0eb4de1f26c7563">
+ <input name="query" value="" type="text" aria-label="Search Query">
+ <button type="submit">
+ Search
+ </button>
+ <div class="search_controls" aria-hidden="true">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search" class="headerSearchPrompt search_prompt" aria-hidden="true">
+ Search
+ </a>
+ <a href="#" class="search_cancel headerSearchCancel" aria-hidden="true"></a>
+ <span class="search_loading" aria-hidden="true"></span>
+ </div>
+</form>
+ </div>
+ </nav>
+
+ <nav class="pkp_navigation_user_wrapper" id="navigationUserWrapper" aria-label="User Navigation">
+ <ul id="navigationUser" class="pkp_navigation_user pkp_nav_list">
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/user/register">
+ Register
+ </a>
+ </li>
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/login">
+ Login
+ </a>
+ </li>
+ </ul>
+
+ </nav>
+ </div><!-- .pkp_head_wrapper -->
+ </header><!-- .pkp_structure_head -->
+
+ <div class="pkp_structure_content has_sidebar">
+ <div id="pkp_content_main" class="pkp_structure_main" role="main">
+
+<div class="page page_article">
+ <nav class="cmp_breadcrumbs" role="navigation" aria-label="You are here:">
+ <ol>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/index">
+ Home
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li class="current">
+ Articles
+ </li>
+ </ol>
+</nav>
+
+ <article class="obj_article_details">
+ <h1 class="page_title">
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ </h1>
+
+
+ <div class="row">
+ <div class="main_entry">
+
+ <ul class="item authors">
+ <li>
+ <span class="name">
+ Calvin Liang
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0002-3795-3441" target="_blank">
+ https://orcid.org/0000-0002-3795-3441
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Jevan Alexander Hutson
+ </span>
+ <span class="affiliation">
+ University of Washington, School of Law
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0003-3312-1733" target="_blank">
+ https://orcid.org/0000-0003-3312-1733
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Os Keyes
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0001-5196-609X" target="_blank">
+ https://orcid.org/0000-0001-5196-609X
+ </a>
+ </span>
+ </li>
+ </ul>
+
+ <div class="item doi">
+ <span class="label">
+ DOI:
+ </span>
+ <span class="value">
+ <a href="https://doi.org/10.5210/fm.v25i10.10274">
+ https://doi.org/10.5210/fm.v25i10.10274
+ </a>
+ </span>
+ </div>
+
+ <div class="item keywords">
+ <span class="label">
+ Keywords:
+ </span>
+ <span class="value">
+ HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice </span>
+ </div>
+
+ <div class="item abstract">
+ <h3 class="label">Abstract</h3>
+					<p>Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+ </div>
+
+
+
+ <div class="item author_bios">
+ <h3 class="label">
+ Author Biographies
+ </h3>
+ <div class="sub_item">
+ <div class="label">
+ Calvin Liang, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ <p>Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.</p>
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Jevan Alexander Hutson, <span class="affiliation">University of Washington, School of Law</span>
+ </div>
+ <div class="value">
+ Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Os Keyes, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
+ </div>
+ </div>
+ </div>
+
+
+ </div><!-- .main_entry -->
+
+ <div class="entry_details">
+
+ <div class="item cover_image">
+ <div class="sub_item">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+							<img src="https://firstmonday.org/ojs/public/journals/3/cover_issue_678_en_US.png" alt="“Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet”">
+ </a>
+ </div>
+ </div>
+
+ <div class="item galleys">
+ <ul class="value galleys_links">
+ <li>
+
+
+
+
+<a class="obj_galley_link file" href="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729">
+
+
+ HTML
+
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="item published">
+ <div class="label">
+ Published
+ </div>
+ <div class="value">
+ 2020-09-10
+ </div>
+ </div>
+
+ <div class="item citation">
+ <div class="sub_item citation_display">
+ <div class="label">
+ How to Cite
+ </div>
+ <div class="value">
+ <div id="citationOutput" role="region" aria-live="polite">
+ <div class="csl-bib-body">
+ <div class="csl-entry">Liang, C., Hutson, J. A., &#38; Keyes, O. (2020). Surveillance, stigma &amp; sociotechnical design for HIV. <i>First Monday</i>, <i>25</i>(10). https://doi.org/10.5210/fm.v25i10.10274</div>
+</div>
+ </div>
+ <div class="citation_formats">
+ <button class="cmp_button citation_formats_button" aria-controls="cslCitationFormats" aria-expanded="false" data-csl-dropdown="true">
+ More Citation Formats
+ </button>
+ <div id="cslCitationFormats" class="citation_formats_list" aria-hidden="true">
+ <ul class="citation_formats_styles">
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274&amp;return=json"
+ >
+ ACM
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274&amp;return=json"
+ >
+ ACS
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274&amp;return=json"
+ >
+ APA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274&amp;return=json"
+ >
+ ABNT
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274&amp;return=json"
+ >
+ Chicago
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274&amp;return=json"
+ >
+ Harvard
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274&amp;return=json"
+ >
+ IEEE
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274&amp;return=json"
+ >
+ MLA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274&amp;return=json"
+ >
+ Turabian
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274&amp;return=json"
+ >
+ Vancouver
+ </a>
+ </li>
+ </ul>
+ <div class="label">
+ Download Citation
+ </div>
+ <ul class="citation_formats_styles">
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/ris?submissionId=10274">
+ <span class="fa fa-download"></span>
+ Endnote/Zotero/Mendeley (RIS)
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/bibtex?submissionId=10274">
+ <span class="fa fa-download"></span>
+ BibTeX
+ </a>
+ </li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="item issue">
+ <div class="sub_item">
+ <div class="label">
+ Issue
+ </div>
+ <div class="value">
+ <a class="title" href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ </div>
+ </div>
+
+ <div class="sub_item">
+ <div class="label">
+ Section
+ </div>
+ <div class="value">
+ Articles
+ </div>
+ </div>
+ </div>
+
+
+ <div class="item copyright">
+ <p>Authors retain copyright to their work published in <em>First Monday</em>. Please see the footer of each article for details.</p>
+ </div>
+
+
+
+ </div><!-- .entry_details -->
+ </div><!-- .row -->
+
+</article>
+
+
+
+</div><!-- .page -->
+
+ </div><!-- pkp_structure_main -->
+
+ <div class="pkp_structure_sidebar left" role="complementary" aria-label="Sidebar">
+ <div class="pkp_block block_developed_by">
+ <div class="content">
+ <a href="http://pkp.sfu.ca/ojs/">
+ Open Journal Systems
+ </a>
+ </div>
+</div>
+<div class="pkp_block block_web_feed">
+ <span class="title">Current Issue</span>
+ <div class="content">
+ <ul>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/atom.svg" alt="Atom logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss20_logo.svg" alt="RSS2 logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss10_logo.svg" alt="RSS1 logo">
+ </a>
+ </li>
+ </ul>
+ </div>
+</div>
+
+ </div><!-- pkp_sidebar.left -->
+ </div><!-- pkp_structure_content -->
+
+<div id="pkp_content_footer" class="pkp_structure_footer_wrapper" role="contentinfo">
+
+ <div class="pkp_structure_footer">
+
+ <div class="pkp_footer_content">
+ <p>A Great Cities Initiative of the University of Illinois at Chicago&nbsp;<a href="http://library.uic.edu/">University Library</a>.</p>
+<p>©&nbsp;<em>First Monday</em>, 1995-2020. ISSN&nbsp;1396-0466.</p>
+ </div>
+
+ <div class="pkp_brand_footer" role="complementary">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/aboutThisPublishingSystem">
+ <img alt="About this Publishing System" src="https://firstmonday.org/ojs/templates/images/ojs_brand.png">
+ </a>
+ </div>
+ </div>
+</div><!-- pkp_structure_footer_wrapper -->
+
+</div><!-- pkp_structure_page -->
+
+<script src="//ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js" type="text/javascript"></script><script src="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.0/jquery-ui.min.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/lib/pkp/js/lib/jquery/plugins/jquery.tag-it.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/popper/popper.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/util.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/dropdown.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/main.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/generic/citationStyleLanguage/js/articleCitation.js" type="text/javascript"></script><script type="text/javascript">
+(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+ga('create', 'UA-41314203-1', 'auto');
+ga('send', 'pageview');
+</script>
+
+
+</body>
+</html>
diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html
new file mode 100644
index 0000000..49cada8
--- /dev/null
+++ b/python/tests/files/genders_g58_fairlie.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film.</title>
+<meta name="description" content="Analysis of Hitchcock’s Rope (1948) as a critique of heteromasculinity that thematizes queer anguish, orality, and women’s relationship to the covert world of homosexual knowledge.">
+<meta name="keywords" content="homosexuality, homophobia, Cold War, the closet, heteromasculinity, queer anguish, anus, suspicion, orality, eating, cannibalism, Catholicism, knowledge, the cinematic cut, cinematic reality, women in Hitchcock, women and gay men, lack, hypocrisy, straight male interlocutor.">
+<style type="text/css">
+<!--
+
+td {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 13px;
+}
+
+.Section1 {
+ page:Section1;
+}
+-->
+</style>
+</head>
+<body alink="#000088" background="../image/back.jpg" vlink="#00aa00">
+<p>
+<table width="600">
+ <tbody>
+ <tr>
+ <td valign="top" width="90"><p><img src="../image/indlgo.gif" alt="Genders OnLine Journal" align="bottom" border="0" height="530" width="97"> </p></td>
+ <td align="right" valign="top" width="530"><table width="530">
+ <tbody>
+ <tr>
+ <td valign="top"><p><b><font size="2">Issue 58</font></b>, Fall 2013</p>
+ <p><font size="5"><strong>Reading Maeshowe</strong></font> <br>
+ Recovering the Feminine in a Neolithic Tomb</p>
+<p>By <strong>CHARLOTTE FAIRLIE</strong></p>
+ <p>[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.</p>
+ <p>[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are &ldquo;Stone Age marvels which inspire and astonish,&rdquo; and Simon W. Hall expresses the experiences of many when he refers to &ldquo;the profound impact of entering a tomb&rdquo; (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.</p>
+ <p> [3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, <u>The History of Orkney Literature</u>, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take &ldquo;the opportunity to reestablish the place&mdash;and, significantly, the inner lives of women in the prehistoric or early historical northern landscape&rdquo; (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie&rsquo;s, Schneider&rsquo;s, and Rose&rsquo;s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.</p>
+ <p>[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: &ldquo;the journey will be one of consequence.&rdquo; Moving from the light of day to the dark mysteries of a tomb&rsquo;s interior &ldquo;is a passage from the profane to the sacred.&rdquo; As such, &ldquo;it will involve transformation&rdquo; (&ldquo;Doorways&rdquo; 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, &ldquo;If the Orkney-Cromarty &lsquo;chambered&rsquo; tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?&rdquo; (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that &ldquo;the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation&rdquo; (&ldquo;Doorways&rdquo; 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were &ldquo;built to be experienced visually, physically and imaginatively,&rdquo; an experience which may well result in some kind of &ldquo;revelation&rdquo; (&ldquo;Doorways.&rdquo; 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like <u>Beowulf</u>, <u>Hamlet</u>, or <u>Moby Dick</u>, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.</p>
+ <p>[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, &ldquo;Maes Howe Sappho.&rdquo; Noting the continuing appeal of the tomb, how today &ldquo;people still treasure&rdquo; the moment that the sun lines up with the passage, he compares the ancient monument to poetry:</p><blockquote>However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61)
+ </blockquote>
+ <p>Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments &ldquo;were and are locations that embodied the biography of the builders, users, spectators, and excavators&rdquo; (10). It follows that if we think of Maeshowe as a text, Brophy&rsquo;s assertion that the monument absorbs the &ldquo;biography&rdquo; of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.</p>
+ <p>[6] In a 1977 column for <u>The Orcadian</u> newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: &ldquo;Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 &ldquo;Two Maeshowe Poems&rdquo; (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See &ldquo;Heart of Neolithic Orkney WHS: Setting Project&rdquo; 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown&rsquo;s poem moves seamlessly through a series of images that integrate Brodgar&rsquo;s &ldquo;light and darkness&rdquo; with Maeshowe&rsquo;s &ldquo;flowers [and] stone&rdquo; (a reference to the runic graffiti carved by Vikings inside the tomb) and &ldquo;skulls&rdquo; (Lines 1, 9, 11). The first word of the poem, &ldquo;Circle,&rdquo; is semantically echoed in the initial word of each ensuing stanza, &ldquo;Ring,&rdquo; &ldquo;Wheel,&rdquo; and &ldquo;Round,&rdquo; subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of &ldquo;life and death&rdquo; (7). For this is a poem about regeneration, how &ldquo;Out of those skulls / Breaks the first green shoot, the full ear, then the bread&rdquo; (11-12). Throughout, juxtaposed images look for the positive to outweigh the negative: &ldquo;We move in shadows,&rdquo; but &ldquo;Brodgar has burned on the moor a dance of sun&rdquo;; &ldquo;Ring of quern and plough&rdquo; (a quern is a stone for grinding grain) are charged to &ldquo;contain / Our tumults of blood&rdquo;; &ldquo;The stars&rsquo; chaos is caught in a strict rein&rdquo;; the word &ldquo;stone&rdquo; is enveloped by &ldquo;flowers,&rdquo; and &ldquo;beauty and love&rdquo;; similarly, &ldquo;snow&rdquo; is flanked by &ldquo;sun&rdquo; and &ldquo;seed.&rdquo; So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.</p>
+ <p>[7] Brown&rsquo;s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted &ldquo;an analogy with the human life cycle and past generations&rdquo; (&ldquo;Doorways&rdquo; 65). Time&rsquo;s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were &ldquo;a metaphorical extension of daily life&rdquo; (&ldquo;Doorways&rdquo; 76). Trevor Garnham, an architect, develops that idea further: &ldquo;Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life&rsquo;s rebirth in some other form&rdquo; (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham&rsquo;s analogy is comparable to the positive final image of Brown&rsquo;s poem, the &ldquo;skulls&rdquo; engendering the &ldquo;green shoots&rdquo; and the &ldquo;bread&rdquo; of life.</p>
+ <p>[8] Brown had written earlier of Maeshowe in his 1996 poem, &ldquo;Maeshowe: Midwinter,&rdquo; choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, &ldquo;Equinox to Hallowmass,&rdquo; immediately integrates the astronomical with the sacred, giving the season of &ldquo;darkness&rdquo; both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes &ldquo;St Lucy,&rdquo; whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose &ldquo;shuttle&rdquo; creates &ldquo;a dark web&rdquo; that &ldquo;fills the loom&rdquo; (7-9), placing at the centre of the poem a world in which light is completely absent: &ldquo;The blackness is solid as a / stone that locks a tomb. / No star shines there&rdquo; (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the &ldquo;true ceremony,&rdquo; suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then &ldquo;the last fleeting solstice flame&rdquo; is &ldquo;caught up,&rdquo; leading to an optimistic note as the children&mdash;the future&mdash;sing with &ldquo;voices like leaves of light&rdquo; (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.</p>
+ <p>[9] While Brown&rsquo;s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as &ldquo;an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people&rsquo;s life into reciprocal relationships with the forces that give shape to their world&rdquo; (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown&rsquo;s two Maeshowe poems together reveals that the &ldquo;green shoot&rdquo; which produces the &ldquo;bread&rdquo; corresponds to the youthful &ldquo;voices like leaves of light.&rdquo; In fact, his insertion of &ldquo;leaves,&rdquo; with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed&mdash;working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth&mdash;are the web of human existence. The physical bread and the metaphysical song are one.</p>
+ <p>[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie&rsquo;s essay, &ldquo;Darkness and Light,&rdquo; describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible &ldquo;to see the real dark for the metaphorical dark . . .the death-dark.&rdquo; Enjoyment of the &ldquo;natural, courteous dark,&rdquo; she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that &ldquo;Maes Howe is a metaphor,&rdquo; perhaps exposing a flaw in Jamie&rsquo;s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor&rsquo;s crew assault her eyes, so she rediscovers no &ldquo;courteous darkness&rdquo; and witnesses &ldquo;no resurrecting beam of sunlight&rdquo; (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of &ldquo;wonder&rdquo; similar to Brown&rsquo;s in <u>The Orcadian</u>, she asks, &ldquo;Were they the first people . . . to articulate this metaphor of light and dark, of life and death?&rdquo; and reflects upon its significance:</p><blockquote>For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We&rsquo;re still mortal, still don&rsquo;t want to die, don&rsquo;t want our loved ones to die. (19-20)
+ </blockquote>
+ <p>Her rejection of a metaphor that she has considered &ldquo;[worn] out&rdquo; and &ldquo;redundant&rdquo; (4, 9) turns out to have been less literary and more personally psychological, for Jamie&rsquo;s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The &ldquo;tender, potent gesture&rdquo; of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).</p>
+ <p>[11] In her poem, &ldquo;Maeshowe,&rdquo; Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the &ldquo;chill seeps into your body&rdquo; (14). However, this ominous impression is immediately dismissed because &ldquo;a stillness that&rsquo;s other than death inhabits / this place where the undead gather to greet the dead&rdquo; (15-17). The journey through the passage will take &ldquo;you&rdquo; to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor&rsquo;s memories:</p><blockquote>In midwinter you&rsquo;ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26)
+ </blockquote>
+ <p>These images recall Garnham&rsquo;s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker&rsquo;s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the &ldquo;ground&rdquo; will turn &ldquo;gold,&rdquo; like a field of ripe grain. Schneider&rsquo;s poem, like Brown&rsquo;s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is &ldquo;piercing / the dark.&rdquo; There is a Romantic quality to these lines: Maeshowe will stay with Schneider&rsquo;s speaker as those daffodils stay with Wordsworth, &ldquo;to flash upon the inward eye / That is the bliss of solitude,&rdquo; to stimulate the imagination (24). Having herself benefited from the tomb&rsquo;s restorative qualities, the speaker is inspired to spread the word, to share her revelation with &ldquo;you,&rdquo; the reader.</p>
+ <p>[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as &ldquo;The first island poems,&rdquo; Brown quotes them emphatically in the second of the paired poems: &ldquo;INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES&rdquo; (&ldquo;Two&rdquo; 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in &ldquo;Maeshowe Nipple,&rdquo; for instance, lists the prosaic concerns of the Vikings, portraying them as &ldquo;intrepid&rdquo; but also homesick, missing &ldquo;sweethearts and family&rdquo; (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the &ldquo;young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind&rdquo;; &ldquo;the dragon boats moved on&rdquo; (Brown &ldquo;Two&rdquo; 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of &ldquo;the first green shoot, the full ear, then the bread&rdquo; in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, &ldquo;their tongue / took root and sprouted from invaded soil / green words for <u>Father</u>, <u>Daughter</u>, <u>Bread</u>&rdquo; (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are &ldquo;straight-branched&rdquo; (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. The bony runes, therefore, have become metaphorical seeds, and Rose&rsquo;s speaker, like Garnham, sees hope in the bone/seed analogy.</p>
+ <p>[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun&rsquo;s midwinter alignment, which Brown describes as a &ldquo;pledge of renewal, a cry of resurrection&rdquo; (&ldquo;Maeshowe in Midwinter&rdquo; 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.</p>
+ <p>[14] Kathleen Jamie states early in her essay, &ldquo;We are conceived and carried in the darkness,&rdquo; emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as &ldquo;a complicit kiss,&rdquo; during which &ldquo;the beam of the setting sun shines along the passage, and onto the tomb&rsquo;s back wall&rdquo; (12). When she goes inside the tomb, she expects &ldquo;not utter darkness, but perhaps a wombish red&rdquo;; however, this is denied her because of the lights of the surveyors, one of whom is &ldquo;folded, foetus-like, into the little cell in the back wall&rdquo;: a foetus implanted in the very place where the sunbeam strikes (12,13). When Jamie leaves, she describes taking &ldquo;the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air&rdquo; (17). The tunnel that admits the beam has become a birth canal, so Jamie&rsquo;s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.</p>
+ <p>[15] Myra Schneider&rsquo;s &ldquo;Maeshowe&rdquo; also hints that to visit the tomb is to return to the womb when the speaker remarks that although &ldquo;you&rdquo; are part of a tour group, you will realize that you are &ldquo;alone&rdquo; and have &ldquo;never travelled so far back / so far in&rdquo; (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: &ldquo;In that deep chamber / you&rsquo;ll be bathed in red, not the red spilt in hatred&mdash;/the red that&rsquo;s birth, the heart looming with the blood&rdquo; (24-28). In the vision that the speaker evokes for the visitor&rsquo;s memory, therefore, the &ldquo;dark that swaddles seeds&rdquo; not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors&rsquo; lights, Schneider suggests that it is possible for us to experience the &ldquo;wombish red&rdquo; that was denied Jamie, blood that is the force of life rather than the mark of violence.</p>
+ <p>[16] Dilys Rose&rsquo;s poem, &ldquo;Maeshowe Nipple,&rdquo; on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb&rsquo;s external anatomy: a &ldquo;breast,&rdquo; with an &ldquo;aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers&rdquo; (1-3). Within, Maeshowe&rsquo;s chambers have been &ldquo;invaded&rdquo; by men who &ldquo;inscribed their conquests&rdquo; and &ldquo;totted up the loot&rdquo; (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.</p>
+ <p>[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote <u>Early in Orcadia</u> after a friend took her to another of Orkney&rsquo;s chambered tombs, Isbister, which has no passage entry, because &ldquo;she knew it would waken something in me&rdquo; (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a &ldquo;few moons&rdquo; have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, &ldquo;the big bad Moon Woman had once had an honouring place,&rdquo; had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was &ldquo;pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in&rdquo; (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman&rsquo;s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters &ldquo;on her hands and knees . . . under and between great stones.&rdquo; Once inside, though, she thinks of the moments before she conceived her child: &ldquo;She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back&rdquo; (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the &ldquo;geo&rdquo; (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is &ldquo;push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must&rdquo; (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.</p>
+ <p>[18] By the end of <u>Early in Orcadia</u>, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the &ldquo;Great Man,&rdquo; who is &ldquo;painted with sun circles,&rdquo; to Moon Woman after he has summoned her to his presence: &ldquo;The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker&rdquo; (169). Moon Woman, however, is aware of the injustice of this arrangement: &ldquo;They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it&rdquo; (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:</p><blockquote>If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon&rsquo;s. (176)
+ </blockquote>
+ <p>She is picturing an intertwining of sun and moon, of masculine and feminine&mdash;a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.</p>
+ <p>[19] Mitchison&rsquo;s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: &ldquo;A fundamental polarity in many creation myths,&rdquo; according to Trevor Garnham, &ldquo;contrasts the dark, fecund, harbouring earth with the up-drawing sun.&rdquo; (145). He points out, for example, that &ldquo;by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun&rdquo; and that in Norse mythology, &ldquo;a male hierarchy supplanted older, matriarchal law&rdquo; (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that &ldquo;ritual activity associated with burial cairns became transferred to stone circles&rdquo; (152).</p>
+ <p>[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a &ldquo;nurturing role&rdquo; (66). The houses at the coastal village site, Scara Brae, therefore, &ldquo;seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket&rdquo; (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Scara Brae, and both were covered with turf (Garnham 48; Challands, Muir &amp; Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.</p>
+ <p>[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, &ldquo;In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction&rdquo; (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Scara Brae that was probably &ldquo;highly restricted on the basis of an individual&rsquo;s status, probably additionally defined in terms of age and gender.&rdquo; However, they also warn that there is insufficient archaeological evidence to &ldquo;leap to conclusions about a patriarchal group of &lsquo;elders&rsquo; who used knowledge as a commodity to maintain their power over women and younger men&rdquo; (Muir &amp; Richards 204). Although cautious, they do acknowledge that &ldquo;power and authority,&rdquo; probably based on &ldquo;cosmological beliefs,&rdquo; would have been necessary to build the monuments (199). Garnham, on the other hand, leaning not only on physical but also on anthropological evidence, argues that the more formal structure <u>does</u> support the idea of hierarchy and that the estimated 100,000 man-hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers &ldquo;can be read as a metaphor of primogeniture&rdquo; (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade&rsquo;s <u>Patterns in Comparative Religion</u>, he also emphasizes that there is &ldquo;a close connection between solar theology and the elite&rdquo; (163). In this context it seems that &ldquo;allowing access to the sun . . . was more important that [sic] allowing access to members of the tribe&rdquo; (131-132).</p>
+ <p>[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its &ldquo;point of maximum power,&rdquo; Maeshowe allows the sun inside only when it is &ldquo;at its lowest ebb.&rdquo; Except at midwinter, &ldquo;the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon&rdquo; (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of <u>Early in Orcadia</u>, Garnham&rsquo;s interpretation of Maeshowe and the stone circles parallels Mitchison&rsquo;s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine &ldquo;whiteness of the moon,&rdquo; and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.</p>
+ <p>[23] Although <u>Early in Orcadia</u> ends with Moon Woman&rsquo;s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, &ldquo;Should I, then?&rdquo; (176). She does not ask &ldquo;Can I?&rdquo; but &ldquo;Should I?&rdquo; Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose&rsquo;s poem, 12th century Vikings may take up residence inside, but 900 years later, the reader is instructed to &ldquo;See,&rdquo; to bear witness to &ldquo;a green breast in a green field,&rdquo; the most nurturing part of a woman&rsquo;s body surrounded by the new growth of spring (1). When Schneider refers to the &ldquo;red that&rsquo;s birth&rdquo; rather than the &ldquo;red spilt in hatred,&rdquo; and describes how the sun will &ldquo;burnish the passage wall, / flood the ground with gold&rdquo; and, similarly, when Jamie refers to the &ldquo;complicit kiss,&rdquo; it is as if Moon Woman&rsquo;s consummation has finally taken place and justice restored.</p>
+ <p>[24] Richards asks where the doors of tombs lead, to what &ldquo;revelation.&rdquo; Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe&rsquo;s entryway towards &ldquo;revelation.&rdquo; Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, &ldquo;looking for Scotland in Scotland&rsquo;s prehistory&rdquo; (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be &ldquo;treading on the toes of archaeologists,&rdquo; points out that their physical &ldquo;evidence may not always offer a clear interpretation, in fact it very seldom does&rdquo; (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.</p>
+ <p>[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, &ldquo;This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery&rdquo; (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that &ldquo;the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place&rdquo; (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that &ldquo;Insights can only come from interpretation&rdquo; (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.</p>
+ <p>[26] George Mackay Brown has written, &ldquo;People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC&rdquo; (&ldquo;Brodgar Poems&rdquo; lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.</p>
+ <p>ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice. </p>
+ <p align="center">WORKS CITED</p>
+ <p>Bevan, Archie, and Brian Murray, eds. <u>The Collected Poems of George Mackay Brown</u>. London: John Murray, 2005. Print.</p>
+ <p>Brown, George Mackay. &ldquo;Brodgar Poems (1992).&rdquo; In Bevan and Murray. 308-312. Print.</p>
+ <p>---. &ldquo;Maeshowe: Midwinter.&rdquo; 1996. In Bevan and Murray. 320. Print.</p>
+ <p>---. &ldquo;Maeshowe at Midwinter.&rdquo; 1977. <u>Under Binkie&rsquo;s Brae</u>. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.</p>
+ <p>---. &ldquo;Two Maeshowe Poems.&rdquo; 1999. In Bevan and Murray. 420-421. Print.</p>
+ <p>Card, Nick, et al. &ldquo;Bringing a Landscape to Life? Researching and Managing &lsquo;The Heart of Neolithic Orkney&rsquo; World Heritage Site.&rdquo; <u>World Archaeology</u> 39.3 (2007): 417-435. EBSCO <u>Academic Search Complete</u>. Web. 29 Jun. 2011.</p>
+ <p>Challands, Adrian, Tom Muir, and Colin Richards. &ldquo;The Great Passage Grave of Maeshowe.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.</p>
+ <p>Crawford, Robert. &ldquo;Maes Howe Sappho.&rdquo; <u>Yale Review</u>: 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.</p>
+ <p>Garnham, Trevor. <u>Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney</u>. Stroud, Gloucestershire: Tempus, 2004. Print.</p>
+ <p>Hall, Simon W. <u>The History of Orkney Literature</u>. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.</p>
+ <p>&ldquo;Heart of Neolithic Orkney WHS: Setting Project.&rdquo; Historic Scotland. 2008. EBSCO <u>Academic Search Complete</u>. Web. 30 Jun. 2011.</p>
+ <p>Jamie, Kathleen. &ldquo;Darkness and Light.&rdquo; <u>Findings: Essays on the Natural and Unnatural World</u>. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.</p>
+ <p>Jones, Si&acirc;n, and Colin Richards. &ldquo;The Villagers of Barnhouse.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.</p>
+ <p>McKie, Robin. &ldquo;Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.&rdquo; <u>The Guardian / The Observer</u>. 6 Oct. 2012. Web. 16 Mar. 2013.</p>
+ <p>Mitchison, Naomi. <u>Early in Orcadia</u>. Glasgow: Richard Drew, 1987. Print.</p>
+ <p>Richards, Colin. &ldquo;Doorways into Another World: The Orkney-Cromarty Chambered Tombs.&rdquo; <u>Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall</u>. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.</p>
+ <p>Riddoch, Lesley. &ldquo;Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.&rdquo; <u>The Scotsman</u>. 13 Feb. 2006. Web. 30 Jun. 2011.</p>
+ <p>Rose, Dilys. &ldquo;Maes Howe Nipple.&rdquo; <u>Bodywork</u>. Edinburgh: Luath Press, 2007. Print.</p>
+ <p>Schneider, Myra. &ldquo;Maeshowe.&rdquo; <u>Circling the Core</u>. London: Enitharmon Press, 2008. 23-24. Print.</p>
+ <p>Wordsworth, William. &ldquo;I wandered lonely as a cloud.&rdquo; <u>The Norton Anthology of English Literature</u>. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.</p>
+<p><strong>Contributor's Note</strong></p>
+ <p><strong>CHARLOTTE FAIRLIE</strong> teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.</p></td>
+ </tr>
+ </tbody>
+ </table>
+ <p align="right"></p></td>
+ </tr>
+ </tbody>
+</table>
+</p>
+<p></p>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/grobid_refs_s1047951103000064.tei.xml b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
new file mode 100644
index 0000000..e0eae8a
--- /dev/null
+++ b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
@@ -0,0 +1,499 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">The community control of rheumatic fever and rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">N</forename><surname>Dondong</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Elkholy</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="285" to="294" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note>Report of a WHO international co-operative project</note>
+ <note type="raw_reference">Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285–294.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Rehman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="185" to="192" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185–192.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever in Sudanese children</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Ismail</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>El Amin</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Arab J Med</title>
+ <imprint>
+ <biblScope unit="volume">2</biblScope>
+ <biblScope unit="page" from="21" to="24" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21–24.</note>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Incidence of heart disease in children at NICVD</title>
+ <author>
+ <persName><forename type="first">K</forename><forename type="middle">U</forename><surname>Aziz</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="300" to="305" />
+ <date type="published" when="1984">1984</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300–305.</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m" type="main">The various manifestations of rheumatic fever as exemplified in childhood and early life</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Cheadle</surname></persName>
+ </author>
+ <imprint>
+ <publisher>Smith and Co</publisher>
+ <biblScope unit="page">1889</biblScope>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889.</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-I. A major public health problem</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="336" to="345" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336–345.</note>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <analytic>
+ <title level="a" type="main">Prevalence of heart disease in school children of Islamabad</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Malik</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Jaffrey</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">Zubeda</forename><surname>Khanum</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pakistan Heart Journal</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <biblScope unit="page" from="2" to="6" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2–6.</note>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease and overcrowding</title>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Watkins</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Quinn</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Am J Public Health</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="1071" to="1081" />
+ <date type="published" when="1948">1948</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071–1081.</note>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <analytic>
+ <title level="a" type="main">The spectrum and specter of rheumatic fever in 1980&apos;s</title>
+ <author>
+ <persName><forename type="first">W</forename><surname>El-Sadr</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Taranta</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Clinical Immunology Up-Date. Edited by Franklin EC</title>
+ <imprint>
+ <biblScope unit="page" from="183" to="203" />
+ <date type="published" when="1979">1979</date>
+ <publisher>Elsevier</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980&apos;s. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183–203.</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">Tonsillitis in adolescent, Bailliere Tendoll and Cox</title>
+ <author>
+ <persName><forename type="first">C</forename><surname>Haig-Brown</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1886">1886</date>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886.</note>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Studies on the transmission within the families of group A hemolytic streptococci</title>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">I</forename><surname>Levine</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Chapman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Guerra</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><surname>Cooper</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Krause</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">J Lab Clin Med</title>
+ <imprint>
+ <biblScope unit="volume">67</biblScope>
+ <biblScope unit="page" from="483" to="494" />
+ <date type="published" when="1966">1966</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483–494.</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="volume">32</biblScope>
+ <biblScope unit="page" from="18" to="25" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Strasser T . Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron. 1978; 32: 18–25.</note>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Brittanica: Book of year 1991</title>
+ <imprint>
+ <date type="published" when="1991">1991</date>
+ <publisher>Chicago</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Brittanica: Book of year 1991. Chicago, 1991.</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <monogr>
+ <title level="m" type="main">Pockets of rheumatic fever in developed world. XI World Congress of Cardiology</title>
+ <author>
+ <persName><forename type="first">R</forename><surname>Talbot</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990">1990</date>
+ <pubPlace>Manila</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990.</note>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease</title>
+ </analytic>
+ <monogr>
+ <title level="j">Circulation</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="A1" to="15" />
+ <date type="published" when="1970">1970</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1–15.</note>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever and rheumatic carditis in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Shafqat</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ramzan</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="2" to="9" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2–9.</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in developing countries</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Padmavati</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">56</biblScope>
+ <biblScope unit="page" from="543" to="550" />
+ <date type="published" when="1979">1979</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543–550.</note>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">Streptococcal infections in families. Factors altering individual susceptibility</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Meyer</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Haggerty</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pediatrics</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="539" to="549" />
+ <date type="published" when="1962">1962</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539–549.</note>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Collagen and connective tissue diseases</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Shanks</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Textbook of Pediatrics</title>
+ <editor>
+ <persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Forfar</surname></persName>
+ <persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Arneil</surname></persName>
+ </editor>
+ <meeting><address><addrLine>Edinburgh</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="page" from="1501" to="1515" />
+ </imprint>
+ <respStmt>
+ <orgName>Churchill Livingstone</orgName>
+ </respStmt>
+ </monogr>
+ <note type="raw_reference">Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501–1515.</note>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Prophylaxis against recurrence of rheumatic fever</title>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Billoo</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">S</forename><surname>Abbasi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Sultana</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><surname>Desa</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <biblScope unit="page" from="8" to="14" />
+ <date type="published" when="1968">1968</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8–14.</note>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="page" from="14" to="16" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14–16.</note>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="389" to="395" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389–395.</note>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever: Clinical profile of 339 cases with long term follow-up</title>
+ <author>
+ <persName><forename type="first">M</forename><forename type="middle">K</forename><surname>Joshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">P</forename><forename type="middle">W</forename><surname>Kandoth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Barve</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Kamat</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Indian pediatr</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="849" to="853" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849–853.</note>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in rural south Indian children</title>
+ <author>
+ <persName><forename type="first">G</forename><surname>Koshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Benjamin</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Cherian</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="599" to="603" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599–603.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html
new file mode 100644
index 0000000..177da83
--- /dev/null
+++ b/python/tests/files/nature_article.html
@@ -0,0 +1,1379 @@
+
+
+
+
+
+
+
+
+<!DOCTYPE html>
+<html lang="en" class="grade-c">
+<head>
+ <meta charset="utf-8">
+<link rel="dns-prefetch" href="//ajax.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.gstatic.com"/>
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
+
+ <title>More than 100 scientific journals have disappeared from the Internet</title>
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+ <meta property="og:type" content="article"/>
+ <meta property="og:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta property="og:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+ <meta name="twitter:card" content="summary_large_image"/>
+ <meta name="twitter:site" content="@nature"/>
+ <meta name="twitter:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta name="twitter:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta name="twitter:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+
+
+ <meta name="journal_id" content="41586"/>
+
+ <meta name="dc.title" content="More than 100 scientific journals have disappeared from the Internet"/>
+
+ <meta name="dc.source" content="Nature 2020"/>
+
+ <meta name="dc.format" content="text/html"/>
+
+ <meta name="dc.publisher" content="Nature Publishing Group"/>
+
+ <meta name="dc.date" content="2020-09-10"/>
+
+ <meta name="dc.type" content="News"/>
+
+ <meta name="dc.language" content="En"/>
+
+ <meta name="dc.copyright" content="2020 Nature"/>
+
+ <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="dc.description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="prism.publicationName" content="Nature"/>
+
+ <meta name="prism.publicationDate" content="2020-09-10"/>
+
+ <meta name="prism.section" content="News"/>
+
+ <meta name="prism.startingPage" content=""/>
+
+ <meta name="prism.endingPage" content=""/>
+
+ <meta name="prism.copyright" content="2020 Nature"/>
+
+ <meta name="prism.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="prism.url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+
+ <meta name="prism.doi" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="dc.identifier" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="DOI" content="10.1038/d41586-020-02610-z"/>
+
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="dc.creator" content="Diana Kwon"/>
+
+ <meta name="dc.subject" content="Publishing"/>
+
+
+
+<script>(function(e){var t=e.documentElement,n=e.implementation;t.className='js';if(n&&n.hasFeature('http://www.w3.org/TR/SVG11/feature#Image','1.1')){t.className+=' svg'}})(document)</script>
+<link rel="stylesheet" href="/static/css/mosaic-grade-c.26f07b2f11.css">
+
+<link rel="stylesheet" class="js-ctm" href="/static/css/magazine-mosaic-150.7f46c29843.css" media="only screen, print and (-webkit-min-device-pixel-ratio:0) and (min-color-index:0), (-ms-high-contrast: none), only all and (min--moz-device-pixel-ratio:0) and (min-resolution: 3e1dpcm)">
+
+
+ <style>
+ .c-header--brand-border {
+ border-bottom: 5px solid #000;
+ }
+ </style>
+
+<link rel="apple-touch-icon" sizes="180x180" href=/static/images/favicons/nature/apple-touch-icon.f39cb19454.png>
+<link rel="icon" type="image/png" sizes="32x32" href=/static/images/favicons/nature/favicon-32x32.3fe59ece92.png>
+<link rel="icon" type="image/png" sizes="16x16" href=/static/images/favicons/nature/favicon-16x16.951651ab72.png>
+<link rel="manifest" href=/static/manifest.1a481c42b1.json>
+<link rel="mask-icon" href=/static/images/favicons/nature/safari-pinned-tab.69bff48fe6.svg color="#000000">
+<link rel="shortcut icon" href=/static/images/favicons/nature/favicon.62367f778b.ico>
+<meta name="msapplication-TileColor" content="#000000">
+<meta name="msapplication-config" content=/static/browserconfig.e35b3b052c.xml>
+<meta name="theme-color" content="#000000">
+<meta name="application-name" content="Nature">
+
+<link rel="search" href="http://www.nature.com/search">
+<link rel="search" href="http://www.nature.com/opensearch/opensearch.xml" type="application/opensearchdescription+xml" title="nature.com">
+<link rel="search" href="http://www.nature.com/opensearch/request" type="application/sru+xml" title="nature.com">
+
+ <meta name="WT.cg_s" content="News"/>
+ <meta name="WT.z_cg_type" content="News"/>
+ <meta name="WT.page_categorisation" content="Article page"/>
+ <meta name="WT.z_subject_term" content="Publishing"/>
+
+<meta name="WT.template" content="oscar"/>
+<meta name="WT.cg_n" content="Nature"/>
+<meta name="dc.rights" content="©2020 Macmillan Publishers Limited. All Rights Reserved."/>
+<meta name="WT.z_bandiera_abtest" content="a"/>
+
+ <script data-test="dataLayer">
+ dataLayer = [{"content":{"category":{"contentType":"news","legacy":{"webtrendsPrimaryArticleType":"news","webtrendsSubjectTerms":"publishing","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Nature","webtrendsContentGroupType":null,"webtrendsContentSubGroup":"News"}},"article":{"doi":"10.1038/d41586-020-02610-z"},"attributes":{"cms":"core media","deliveryPlatform":"oscar","copyright":{"open":false,"legacy":{"webtrendsLicenceType":null}}},"contentInfo":{"authors":["Diana Kwon"],"publishedAt":1599696000,"publishedAtString":"2020-09-10","title":"More than 100 scientific journals have disappeared from the Internet","legacy":null,"publishedAtTime":null,"documentType":"aplusplus"},"journal":{"pcode":"nature","title":"nature","volume":null,"issue":null},"authorization":{"status":true},"features":[{"name":"furtherReadingSection","present":false}],"collection":null},"page":{"category":{"pageType":"article"},"attributes":{"template":"magazine mosaic","featureFlags":[{"name":"ab_test_news_feature","active":false}]},"search":null},"privacy":{},"version":"1.0.0","product":null,"session":null,"user":null,"backHalfContent":false}];
+</script>
+
+<script>
+ (function() {
+ function deleteCookie (name, domain) {
+ document.cookie = encodeURIComponent(name) +
+ '=' +
+ ';path=/' +
+ ';domain=' + domain +
+ ';expires=Thu, 01 Jan 1970 00:00:00 GMT';
+ }
+
+ var consentCookieParts = ('; ' + document.cookie).split('; OptanonConsent=');
+
+ if (consentCookieParts.length > 1) {
+ consentCookieParts.shift(); // remove redundant first part from the split array
+
+ // onetrust can set the same cookie multiple times with different domain specificities
+ for (let i=0; i<consentCookieParts.length; i++) {
+ var otCookieGroups = consentCookieParts[i].split('&groups=').pop().split('&').shift();
+
+ if (otCookieGroups.indexOf('C0001') === -1) {
+ deleteCookie('OptanonConsent', 'nature.com');
+ deleteCookie('OptanonAlertBoxClosed', 'nature.com');
+ }
+ }
+ }
+ })();
+</script>
+
+<script>
+ (function(w,d,t) {
+ function cc() {
+ var h = w.location.hostname;
+ if (h.indexOf('preview-www.nature.com') > -1) return;
+
+ var e = d.createElement(t),
+ s = d.getElementsByTagName(t)[0];
+
+ if (h.indexOf('nature.com') > -1) {
+ e.src = 'https://cdn.cookielaw.org/scripttemplates/otSDKStub.js';
+ e.setAttribute('data-domain-script', '83f2c78a-6cbc-4d1a-9088-3f8e8c4c7460');
+ } else {
+ e.src = '/static/js/cookie-consent-bundle.9d49adbc02.js';
+ e.setAttribute('data-consent', h);
+ }
+ s.parentNode.insertBefore(e, s);
+ }
+
+ !!w.google_tag_manager ? cc() : window.addEventListener('gtm_loaded', function() {cc()});
+ })(window,document,'script');
+</script>
+<script>
+ function OptanonWrapper() {
+ window.dataLayer.push({event:'OneTrustGroupsUpdated'});
+ document.activeElement.blur();
+ }
+</script>
+
+
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;
+
+
+ j.addEventListener('load', function() {
+ var _ge = new CustomEvent('gtm_loaded', { bubbles: true });
+ d.dispatchEvent(_ge);
+ });
+
+ f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-NWDMT9Q');</script>
+
+
+
+</head>
+<body>
+
+
+
+<div role="banner" class="position-relative cleared z-index-50 background-white" data-test="top-containers">
+
+
+ <a class="c-skip-link u-hide-print" href="#content">Skip to main content</a>
+
+
+
+
+
+
+
+ <aside class="c-ad c-ad--728x90">
+ <div class="c-ad__inner" data-container-type="banner-advert">
+ <p class="c-ad__label">Advertisement</p>
+
+
+
+ <div id="article-doubleclickad-container">
+ <div id="div-gpt-ad-top-1"
+ class="div-gpt-ad advert leaderboard js-ad text-center hide-print grade-c-hide"
+ data-ad-type="top"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="728x90"
+ data-gpt-targeting="type=article;pos=top;artid=d41586-020-02610-z;doi=10.1038/d41586-020-02610-z;subjmeta=479,648,706;kwrd=Publishing">
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing"
+ alt="Advertisement"
+ width="728"
+ height="90"></a>
+ </noscript>
+ </div>
+</div>
+
+
+
+
+ </div>
+ </aside>
+
+
+
+
+
+ <div class="c-grade-c-banner u-hide">
+ <div class="c-grade-c-banner__container">
+
+ <p>Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+ the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+ Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+ and JavaScript.</p>
+
+ </div>
+ </div>
+
+
+
+
+ <header class="c-header c-header--brand-border" id="header" data-header>
+ <div class="c-header__row-border">
+ <div class="c-header__container">
+ <div class="c-header__layout">
+ <a href="/nature"
+ data-track="click" data-track-action="home" data-track-category="nature-150-split-header" data-track-label="image">
+ <picture class="c-header__logo">
+ <source srcset="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" media="(min-width: 769px)">
+ <img src="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" alt="Nature">
+ </picture>
+ </a>
+ <div class="c-header__layout">
+
+ <div class="c-header__site-navigation c-header__site-navigation--show-at-md"
+ data-test="siteindex-link">
+ <a class="c-header__link" href="https://www.nature.com/siteindex"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open nature research index" data-track-label="link">
+ <span>View all Nature Research journals</span>
+ </a>
+ </div>
+
+ <div class="c-header__site-navigation c-header__site-navigation--border">
+ <a class="c-header__link"
+ href="#search-menu"
+ data-header-expander
+ data-test="search-link" data-track="click" data-track-category="nature-150-split-header" data-track-action="open search tray" data-track-label="button">
+ <span>Search</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M16.48 15.455c.283.282.29.749.007 1.032a.738.738 0 01-1.032-.007l-3.045-3.044a7 7 0 111.026-1.026zM8 14A6 6 0 108 2a6 6 0 000 12z"/></svg>
+ </a>
+ <a href="/nams/svc/myaccount"
+ id="my-account"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="my account" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>My Account</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+<a href="https://idp.nature.com/authorize/natureuser?client_id&#x3D;grover&amp;redirect_uri&#x3D;https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z"
+ id="login-button"
+ style="display: none;"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="login" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>Login</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="c-header__container" data-test="c-header__container">
+ <ul class="c-header__menu">
+
+ <li class="c-header__item" data-test="explore-content-button">
+ <a href="#explore"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open explore expander" data-track-label="button">
+ <span>Explore <span class="c-header__show-text">our content</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item">
+ <a href="#journal-info"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open journal information expander" data-track-label="button">
+ <span>Journal info<span class="c-header__show-text">rmation</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item c-header__item--pipe">
+ <a class="c-header__link"
+ href="https://www.nature.com/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-category="nature-150-split-header"
+ data-track-label="link">
+ <span>Subscribe</span>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+
+ </header>
+
+
+
+
+ <div class="u-mb-16">
+ <div class="u-container">
+ <ol class="c-breadcrumbs">
+ <li class="c-breadcrumbs__item" id="breadcrumb0"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb1"><a class="c-breadcrumbs__link"
+ href="/"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:nature"><span itemprop="title">nature</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb1"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb2"><a class="c-breadcrumbs__link"
+ href="/nature/articles?type&#x3D;news"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:news"><span itemprop="title">news</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb2"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb3"><span itemprop="title">article</span></li>
+ </ol>
+ </div>
+ </div>
+
+
+
+
+
+
+</div>
+
+
+ <div id="content" class="article-page position-relative z-index-1">
+ <section class="container highlight-container article-page--news container-with-gap">
+ <article class="article-item article-item--open" itemscope="" itemtype="http://schema.org/NewsArticle"
+ data-track-component="news">
+ <div class="container cleared container-type-article" data-container-type="article" itemprop="articleBody">
+ <div class="content position-relative cleared clear mq1200-padded" data-component="article-container"
+ role="main">
+ <header class="article-item__header clear cleared pull--both">
+ <div class="article__type">NEWS
+ <div class="ml10 article__date">
+ <time itemprop="datePublished">10 September 2020</time>
+ </div>
+ </div>
+
+ <div class="clear cleared"></div>
+ <h1 class="article-item__title serif" itemprop="headline">More than 100 scientific journals have disappeared from the Internet</h1>
+
+ <div class="article-item__teaser-text serif">
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.
+ </div>
+ </header>
+
+ <div class="clear cleared"></div>
+
+ <div class="bordered-container clear cleared pull--both">
+ <div id="author-affiliations" class="tab-group text14" role="tablist" data-test="author-affiliations" data-tab-group>
+ <div class="cleared">
+
+ <div id="author-affiliation-news-0" class="tab-box js-box-wrapper">
+ <h3 id="author-affiliation-news-0-head" data-track="click" data-track-label="view author info" class="sans-serif strong tab tab-skin ma0" role="tab"
+ aria-controls="author-affiliation-news-0-content" data-tooltip="Show author information">
+ Diana Kwon
+ </h3>
+ <div id="author-affiliation-news-0-content" class="tab-content pin-right grid grid-12 last"
+ role="tabpanel">
+ <div class="pa10" aria-labelledby="author-affiliation-news-0-head">
+ <div class="clear cleared">
+
+
+ <div class="align-left">
+ <h4 class="sans-serif">Search for this author in:</h4>
+ <ul class="ma0 clean-list">
+ <li class="strong"><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd&#x3D;search&amp;term&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Pub Med" >Pub Med</a></li>
+
+ <li class="strong"><a href="https://www.nature.com/search?order&#x3D;date_desc&amp;q&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Nature.com" >Nature.com</a></li>
+
+ <li class="strong"><a href="https://scholar.google.co.uk/scholar?as_q&#x3D;&amp;btnG&#x3D;Search+Scholar&amp;as_sauthors&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Google Scholar" >Google Scholar</a></li>
+ </ul>
+ </div>
+
+
+
+ </div>
+ </div>
+ </div>
+ </div>
+
+ </div>
+</div>
+
+ </div>
+
+ <div class="clear cleared pull--both">
+ <ul class="social clean-list inline-list hide-print">
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="twitter" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="https://twitter.com/intent/tweet?text=More+than+100+scientific+journals+have+disappeared+from+the+Internet&url=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Twitter</title>
+ <desc>Share on Twitter</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M20.8125,11.4875 C21.42,11.10375 21.8875,10.49625 22.105,9.7725 C21.5375,10.1275 20.90875,10.385 20.23875,10.5225 C19.70625,9.9225 18.9425,9.545 18.0975,9.545 C16.475,9.545 15.16,10.9325 15.16,12.6425 C15.16,12.885 15.185,13.1225 15.235,13.3475 C12.7975,13.2175 10.63125,11.985 9.1825,10.11 C8.93,10.56875 8.785,11.10125 8.785,11.66875 C8.785,12.74375 9.30375,13.69125 10.09125,14.2475 C9.61125,14.23125 9.1575,14.09 8.76125,13.86 L8.76125,13.8975 C8.76125,15.3975 9.77375,16.65125 11.11875,16.935 C10.87125,17.0075 10.6125,17.04375 10.34375,17.04375 C10.15625,17.04375 9.96875,17.025 9.79125,16.98875 C10.16625,18.22125 11.24875,19.11875 12.535,19.1425 C11.52875,19.97375 10.2625,20.4675 8.885,20.4675 C8.6475,20.4675 8.415,20.455 8.185,20.42625 C9.485,21.30375 11.02875,21.81625 12.6875,21.81625 C18.09,21.81625 21.04375,17.095 21.04375,13.00125 L21.03625,12.60125 C21.61125,12.16375 22.11125,11.6175 22.50125,10.99625 C21.97375,11.2425 21.4075,11.40875 20.81375,11.48375 L20.8125,11.4875 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="facebook" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Facebook</title>
+ <desc>Share on Facebook</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15.89625,22.8625 L12.57125,22.8625 L12.57125,15.02125 L10.90875,15.02125 L10.90875,12.31875 L12.57125,12.31875 L12.57125,10.69625 C12.57125,8.4925 13.50875,7.18 16.175,7.18 L18.39375,7.18 L18.39375,9.8825 L17.00625,9.8825 C15.96875,9.8825 15.9,10.26 15.9,10.965 L15.895,12.3175 L18.4075,12.3175 L18.115,15.02 L15.89625,15.02 L15.89625,22.8625 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="email" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="mailto:?subject=More than 100 scientific journals have disappeared from the Internet&body=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share via E-Mail</title>
+ <desc>Share via E-Mail</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15,15.3269887 L10.6248577,11.9177869 C10.4236021,11.7609644 10.1299323,11.7927468 9.96892789,11.988775 C9.80792343,12.1848031 9.84055341,12.4708451 10.041809,12.6276676 L14.7012493,16.2584003 C14.8680779,16.3940555 15.1152493,16.4013884 15.2915244,16.2640313 C15.2939898,16.2622325 15.2963784,16.2603294 15.2987507,16.2584003 L19.958191,12.6276676 C20.1594466,12.4708451 20.1920766,12.1848031 20.0310721,11.988775 C19.8700677,11.7927468 19.5763979,11.7609644 19.3751423,11.9177869 L15,15.3269887 Z M9,10 L21,10 C21.5522847,10 22,10.4477153 22,11 L22,19 C22,19.5522847 21.5522847,20 21,20 L9,20 C8.44771525,20 8,19.5522847 8,19 L8,11 C8,10.4477153 8.44771525,10 9,10 Z"></path>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+</ul>
+
+ </div>
+
+
+
+
+ <div class="align-left">
+
+ <div class="article__body serif cleared">
+                        <p>Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August<sup><a href="#ref-CR1" data-track="click" data-action="anchor-link" data-track-label="go to reference" data-track-category="references">1</a></sup>.</p><p>“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"><h1 class="recommended__title serif">Investigating journals: The dark side of publishing</h1></a>
+ </aside></p><p>More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.</p><p>The study lays out a "compelling case" for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.</p><h2>Vanishing journals</h2><p>Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.</p><p>Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: Some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"><h1 class="recommended__title serif">Radical open-access plan could spell end to journal subscriptions</h1></a>
+                    </aside></p><p>Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.</p><p>To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled into digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).</p><p>The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.</p><h2>Preserving the literature</h2><p>Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to have used a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. “It’s really hard to pin down when something doesn't absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-019-02038-0" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16870448.jpg"><h1 class="recommended__title serif">India culls hundreds more ‘dubious’ journals from government approved list</h1></a>
+                    </aside></p><p>Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren't captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.</p><p>Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.</p>
+ </div>
+
+ <div class="emphasis">doi: <a href="https://doi.org/10.1038/d41586-020-02610-z">https://doi.org/10.1038/d41586-020-02610-z</a></div>
+ <div class="anchor-link mt40" data-toggle="anchor-links"></div>
+ <div id="references" class="references" data-toggle="anchor-links-section" data-label="References" data-concertina="true">
+ <section aria-labelledby="Bib1"><div class="serif article-section js-article-section cleared clear" id="Bib1-section"><h2 class="js-section-title section-title strong position-relative tighten-line-height background-gray-light pt20 pb6 pl0 pr20 standard-space-below small-space-above mq640-pt10 mq640-pb10 mq640-pl20 mq640-mt0 mq640-ml-20 mq640-mr-20 extend-left" id="Bib1">References</h2><div class="pl20 mq875-pl0 js-collapsible-section" id="Bib1-content"><div data-container-section="references"><ol class="clean-list ma0 standard-space-below indented-list" data-test="references-list"><li class="small-space-below border-gray-medium border-bottom-1 position-relative js-ref-item" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Article" data-test="citation"><span class="indented-counter serif h2 tighten-line-height text-right position-absolute grade-c-hide">1.</span><p class="tiny-space-below" id="ref-CR1">Laakso, M., Matthias, L. &amp; Jahn, N. Preprint at <a href="https://arxiv.org/abs/2008.11933">https://arxiv.org/abs/2008.11933</a> (2020).</p><ul class="js-ref-links clean-list cleared strong sans-serif text13 hide-print small-space-below"><li class="pin-right"><ul class="clean-list ma0"></ul></li></ul></li></ol><p class="hide-print text-right"><a href="/articles/d41586-020-02610-z-references.ris" class="text14 sans-serif strong" data-track="click" data-track-action="download citation references" data-track-label="link">Download references</a></p></div></div></div></section>
+ </div>
+
+
+
+
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="inPage box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-inPage-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-inPage">
+ <input id="briefing-box-signup-form-inPage-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-inPage-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-inPage-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-inPage-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-inPage-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+
+
+ </div>
+
+ <aside class="article__aside align-right">
+ <div class="related-content shrink--aside hide-print">
+
+ <h3 class="aside__title sans-serif">Related Articles</h3>
+ <ul class="ma0 clean-list">
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click"
+ data-track-label="related article (rank:0)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ </noscript>
+
+ Radical open-access plan could spell end to journal subscriptions
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click"
+ data-track-label="related article (rank:1)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ </noscript>
+
+ Investigating journals: The dark side of publishing
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-020-01066-5" data-track="click"
+ data-track-label="related article (rank:2)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ </noscript>
+
+ Nature to join open-access Plan S, publisher says
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07557-w" data-track="click"
+ data-track-label="related article (rank:3)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ </noscript>
+
+ Funders flesh out details of Europe’s bold open-access plan
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07245-9" data-track="click"
+ data-track-label="related article (rank:4)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ </noscript>
+
+ AI peer reviewers unleashed to ease publishing grind
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/open-access-the-true-cost-of-science-publishing-1.12676" data-track="click"
+ data-track-label="related article (rank:5)">
+
+ The true cost of science publishing
+ </a>
+ </h3>
+ </li>
+
+ </ul>
+ </div>
+
+ <div class="article__subjects bordered-container shrink--aside hide-print">
+ <h3 class="aside__title sans-serif">Subjects</h3>
+ <ul class="ma0 subject-list cleared clean-list inline-list">
+
+ <li class="subject"><a href="/subjects/publishing" data-track="click"
+ data-track-label="subject (rank:0)">Publishing</a>
+ </li>
+
+ </ul>
+ </div>
+
+
+
+<div id="div-gpt-ad-right-2"
+ class="div-gpt-ad medium-rectangle advert js-ad text-center hide-print grade-c-hide"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="300x250"
+ data-gpt-targeting="pos=right;artid=/articles/d41586-020-02610-z;path=/articles/d41586-020-02610-z"
+ data-ad-type="right"
+ >
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z"
+ alt="Advertisement"
+ width="300"
+ height="250"/>
+ </a>
+ </noscript>
+</div>
+
+
+ <div class="nature-briefing--sidebar bordered-container shrink--aside hide-print">
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="sidebar box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Sign up to Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-sidebar-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-sidebar">
+ <input id="briefing-box-signup-form-sidebar-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-sidebar-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-sidebar-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-sidebar-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-sidebar-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+</div>
+
+ </aside>
+ </div>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">
+ <meta content="Macmillan Publishers Limited, part of Springer Nature" itemprop="name"/>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="author" itemtype="https://schema.org/Organization">
+ <meta content="Nature Editorial" itemprop="name"/>
+ </div>
+ <img src="/platform/track/article/d41586-020-02610-z" width="1" height="1" alt="" class="visually-hidden"/>
+</article>
+
+
+
+
+
+
+
+<div class="c-site-messages message hide u-hide-print c-site-messages--nature-briefing c-site-messages--nature-briefing-email-variant c-site-messages--nature-briefing-redesign-2020 sans-serif"
+data-component-id="nature-briefing-banner"
+data-component-expirydays="30"
+data-component-trigger-scroll-percentage="15"
+data-track="in-view"
+data-track-action="in-view"
+data-track-category="nature briefing"
+data-track-label="redesign banner visible">
+
+
+ <div class="c-site-messages__banner-large">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__form-container">
+
+
+
+ <div class="grid grid-12 last">
+ <div class="grid grid-4">
+ <img alt="Nature Briefing" src="/static/images/logos/nature-briefing-logo-n150-white.d81c9da3ec.svg" width="250" height="40">
+ <p class="c-site-messages--nature-briefing__strapline extra-tight-line-height">Sign up for the <em>Nature Briefing</em> newsletter — what matters in science, free to your inbox daily.</p>
+ </div>
+ <div class="grid grid-8 last">
+ <form action="/briefing/signup/formfeedback" method="post" data-location="banner" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-banner-signup-form-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBannerRedesign2020">
+ <input id="briefing-banner-signup-form-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBanner">
+ <label class="nature-briefing-banner__email-label" for="banner-EmailAddressInput">Email address</label>
+
+ <div class="nature-briefing-banner__email-wrapper">
+ <input class="nature-briefing-banner__email-input box-sizing text14" type="email" id="banner-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-emailbanner-email-input">
+ <button type="submit" class="nature-briefing-banner__submit-button box-sizing text14" data-test-element="briefing-emailbanner-signup-button">Sign up</button>
+ </div>
+
+ <div class="nature-briefing-banner__checkbox-wrapper grid grid-12 last">
+ <input class="nature-briefing-banner__checkbox-checkbox" id="gdpr-briefing-banner-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-emailbanner-gdpr-checkbox" required>
+ <label class="nature-briefing-banner__checkbox-label box-sizing text13 sans-serif block tighten-line-height" for="gdpr-briefing-banner-checkbox">I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+ </form>
+ </div>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+ <div class="c-site-messages__banner-small">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__content text14">
+ <span class="c-site-messages--nature-briefing__strapline strong serif">Get the most important science stories of the day, free in your inbox.</span>
+ <a class="nature-briefing__link text14 sans-serif"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner CTA to site"
+ data-test-element="briefing-banner-link"
+ target="_blank"
+ rel="noreferrer noopener"
+ href="/briefing/signup/?origin=Nature&amp;originReferralPoint=EmailBanner">Sign up for Nature Briefing
+ </a>
+ </div>
+
+ </div>
+
+</div>
+
+ </section>
+</div>
+ <script>
+ window.onload = function () {
+ Array.prototype.slice.call(document.querySelectorAll(".magazine-infographic > iframe"))
+ .forEach(function (element) {
+ function listener(event) {
+ if (event.data.height) {
+ if (element.id === event.data.requestData.id) {
+ element.setAttribute("height", event.data.height)
+ }
+ }
+ }
+
+ window.addEventListener("message", listener);
+ element.contentWindow.postMessage({name: "getHeight", id: element.id}, "*");
+ });
+ }
+ </script>
+ <script>
+ var linkEl = document.querySelector('.js-ctm');
+ if (linkEl && window.matchMedia && window.matchMedia(linkEl.media).matches) {
+ var fragment = document.createDocumentFragment();
+ var polyfillScript = document.createElement('script');
+ var header150Script = null;
+ var appScript = document.createElement('script');
+ var sharedEs6Script = document.createElement('script');
+
+ polyfillScript.src = 'https://cdn.polyfill.io/v2/polyfill.min.js?features=default,IntersectionObserver,Array.prototype.includes,Promise';
+ polyfillScript.async = false;
+ fragment.appendChild(polyfillScript);
+
+ appScript.src = '/static/js/magazine/magazine-mosaic.71d8740808.js';
+ appScript.async = false;
+ fragment.appendChild(appScript);
+
+ sharedEs6Script.src = '/static/js/shared-es6-bundle.c83ed51f05.js';
+ sharedEs6Script.async = false;
+ fragment.appendChild(sharedEs6Script);
+
+ header150Script = document.createElement('script');
+ header150Script.src = '/static/js/header-150-bundle.aaea96385f.js';
+ header150Script.async = false;
+ fragment.appendChild(header150Script);
+
+ document.body.appendChild(fragment);
+ }
+ </script>
+ <script>
+ var idp = {
+ hasNatureUserProof: function (hasProof) {
+ if (!hasProof) {
+ document.getElementById("my-account").setAttribute("style", "display: none;");
+ document.getElementById("login-button").setAttribute("style", "");
+ }
+ }
+ }
+ </script>
+ <script src="https://verify.nature.com/verify/nature.min.js"></script>
+ <noscript>
+ <img src="https://verify.nature.com/verify/nature.png" alt="" width="0" height="0"/>
+ </noscript>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Explore-our-content" data-test="Explore-our-content" id="explore" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Explore-our-content" class="c-header-expander__heading u-js-hide">Explore our content</h2>
+ <ul class="c-header-expander__list">
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/research"
+ data-track="click"
+ data-track-action="research"
+ data-track-label="link">
+ Research
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/news"
+ data-track="click"
+ data-track-action="news"
+ data-track-label="link">
+ News
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/opinion"
+ data-track="click"
+ data-track-action="opinion"
+ data-track-label="link">
+ Opinion
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/research-analysis"
+ data-track="click"
+ data-track-action="research analysis"
+ data-track-label="link">
+ Research Analysis
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/careers"
+ data-track="click"
+ data-track-action="careers"
+ data-track-label="link">
+ Careers
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/books-culture"
+ data-track="click"
+ data-track-action="books and culture"
+ data-track-label="link">
+ Books and Culture
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/podcast"
+ data-track="click"
+ data-track-action="podcasts"
+ data-track-label="link">
+ Podcasts
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/videoarchive"
+ data-track="click"
+ data-track-action="videos"
+ data-track-label="link">
+ Videos
+ </a>
+ </li>
+
+
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/current-issue"
+ data-track="click"
+ data-track-action="current issue"
+ data-track-label="link">
+ Current Issue
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-issues"
+ data-track="click"
+ data-track-action="browse issues"
+ data-track-label="link">
+ Browse Issues
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/articles"
+ data-track="click"
+ data-track-action="browse articles"
+ data-track-label="link">
+ Browse Articles
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/collections"
+ data-track="click"
+ data-track-action="browse collections"
+ data-track-label="link">
+ Browse Collections
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-subjects"
+ data-track="click"
+ data-track-action="browse subjects"
+ data-track-label="link">
+ Browse Subjects
+ </a>
+ </li>
+
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="https://www.nature.com/my-account/alerts/subscribe-journal?list-id&#x3D;1"
+ data-track="click"
+ data-track-action="Sign up for alerts"
+ data-track-label="link">Sign up for alerts<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m4 10h2.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-3.08578644l-1.12132034 1.1213203c-.18753638.1875364-.29289322.4418903-.29289322.7071068v.1715729h14v-.1715729c0-.2652165-.1053568-.5195704-.2928932-.7071068l-1.7071068-1.7071067v-3.4142136c0-2.76142375-2.2385763-5-5-5-2.76142375 0-5 2.23857625-5 5zm3 4c0 1.1045695.8954305 2 2 2s2-.8954305 2-2zm-5 0c-.55228475 0-1-.4477153-1-1v-.1715729c0-.530433.21071368-1.0391408.58578644-1.4142135l1.41421356-1.4142136v-3c0-3.3137085 2.6862915-6 6-6s6 2.6862915 6 6v3l1.4142136 1.4142136c.3750727.3750727.5857864.8837805.5857864 1.4142135v.1715729c0 .5522847-.4477153 1-1 1h-4c0 1.6568542-1.3431458 3-3 3-1.65685425 0-3-1.3431458-3-3z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Journal-information" id="journal-info" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Journal-information" class="c-header-expander__heading u-js-hide">Journal information</h2>
+ <ul class="c-header-expander__list">
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/about"
+ data-track="click"
+ data-track-action="about the journal"
+ data-track-label="link">
+ About the Journal
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-authors"
+ data-track="click"
+ data-track-action="for authors"
+ data-track-label="link">
+ For Authors
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-referees"
+ data-track="click"
+ data-track-action="for referees"
+ data-track-label="link">
+ For Referees
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/awards"
+ data-track="click"
+ data-track-action="awards"
+ data-track-label="link">
+ Awards
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-label="link">
+ Subscribe
+ </a>
+ </li>
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="http://mts-nature.nature.com/"
+ data-track="click"
+ data-track-action="Submit manuscript"
+ data-track-label="link">Submit manuscript<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m15 0c1.1045695 0 2 .8954305 2 2v5.5c0 .27614237-.2238576.5-.5.5s-.5-.22385763-.5-.5v-5.5c0-.51283584-.3860402-.93550716-.8833789-.99327227l-.1166211-.00672773h-9v3c0 1.1045695-.8954305 2-2 2h-3v10c0 .5128358.38604019.9355072.88337887.9932723l.11662113.0067277h7.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-7.5c-1.1045695 0-2-.8954305-2-2v-10.17157288c0-.53043297.21071368-1.0391408.58578644-1.41421356l3.82842712-3.82842712c.37507276-.37507276.88378059-.58578644 1.41421356-.58578644zm-.5442863 8.18867991 3.3545404 3.35454039c.2508994.2508994.2538696.6596433.0035959.909917-.2429543.2429542-.6561449.2462671-.9065387-.0089489l-2.2609825-2.3045251.0010427 7.2231989c0 .3569916-.2898381.6371378-.6473715.6371378-.3470771 0-.6473715-.2852563-.6473715-.6371378l-.0010428-7.2231995-2.2611222 2.3046654c-.2531661.2580415-.6562868.2592444-.9065605.0089707-.24295423-.2429542-.24865597-.6576651.0036132-.9099343l3.3546673-3.35466731c.2509089-.25090888.6612706-.25227691.9135302-.00001728zm-.9557137-3.18867991c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5zm-8.5-3.587-3.587 3.587h2.587c.55228475 0 1-.44771525 1-1zm8.5 1.587c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+
+
+ <div id="search-menu" class="c-header-expander c-header-expander--tray u-hide-print" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <h2 class="u-visually-hidden">Search</h2>
+ <div data-test="inline-search">
+ <div class="c-header-expander__keyline u-mb-16">
+ <form action="/search"
+ method="get"
+ role="search"
+ class="c-header-expander__form"
+ autocomplete="off"
+ data-dynamic-track-label
+ data-track="submit" data-track-action="search" data-track-label="form">
+ <label class="c-header-expander__heading" for="keywords">Article Search</label>
+ <div class="c-form-field u-display-flex">
+ <input type="text"
+ class="c-form-field__input u-flex-shrink"
+ id="keywords"
+ name="q"
+ value=""
+ placeholder="Search by keywords or author"
+ data-test="search-keywords">
+ <button type="submit" class="c-button c-button--contrast u-flex-static u-ml-8" data-test="search-submit">Search</button>
+ </div>
+ <p class="u-ma-0">
+ <a href="/search/advanced"
+ data-track="click" data-track-action="advanced search" data-track-label="link">
+ Advanced search
+ </a>
+ </p>
+ </form>
+ </div>
+ <div class="c-header-expander__keyline">
+ <h3 class="c-header-expander__heading">Quick links</h3>
+ <ul class="u-list-reset">
+ <li class="u-display-inline-block u-mr-24"><a href="/subjects" data-track="click" data-track-action="explore articles by subject" data-track-label="link">Explore articles by subject</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/naturecareers" data-track="click" data-track-action="find a job" data-track-label="link">Find a job</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to authors</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+
+
+
+
+<footer role="contentinfo" class="composite-layer">
+ <div class="u-mt-16 u-mb-16">
+ <div class="u-container">
+ <div class="u-display-flex u-flex-wrap u-justify-content-space-between">
+ <p class="c-meta u-ma-0 u-mr-24">
+
+</p>
+
+ <p class="c-meta u-ma-0">
+ <span aria-level="2" class="c-meta__item" itemprop="name">
+ Nature
+ </span>
+ <span class="c-meta__item">
+ <abbr title="International Standard Serial Number">ISSN</abbr> <span itemprop="issn">1476-4687</span> (online)
+ </span>
+ </p>
+ </div>
+ </div>
+</div>
+
+
+ <div itemscope itemtype="http://schema.org/Periodical">
+ <meta itemprop="publisher" content="Springer Nature">
+ <div class="c-footer">
+ <div class="u-container">
+ <div class="u-hide-print" data-track-component="footer">
+ <h2 aria-level="2" class="u-visually-hidden">nature.com sitemap</h2>
+ <div class="c-footer__header">
+ <div class="c-footer__logo">
+ <img alt="Nature Research" src="/static/images/logos/nature research-white-150.f4acf77e0c.svg" loading="lazy" width="200" height="26">
+ </div>
+ <ul class="c-menu c-menu--inherit u-mr-32">
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/company_info/index.html" data-track="click" data-track-action="about us" data-track-label="link">About us</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/press_room/press_releases.html" data-track="click" data-track-action="press releases" data-track-label="link">Press releases</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://press.nature.com/" data-track="click" data-track-action="press office" data-track-label="link">Press office</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://support.nature.com/support/home" data-track="click" data-track-action="contact us" data-track-label="link">Contact us</a></li>
+ </ul>
+ <ul class="c-menu c-menu--inherit">
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.facebook.com/nature/" aria-label="Nature on Facebook" data-track="click" data-track-action="facebook" data-track-label="link">
+ <svg class="u-icon u-mt-2 u-mb-2" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 20 20"><path d="M2.5 20C1.1 20 0 18.9 0 17.5v-15C0 1.1 1.1 0 2.5 0h15C18.9 0 20 1.1 20 2.5v15c0 1.4-1.1 2.5-2.5 2.5h-3.7v-7.7h2.6l.4-3h-3v-2c0-.9.2-1.5 1.5-1.5h1.6V3.1c-.3 0-1.2-.1-2.3-.1-2.3 0-3.9 1.4-3.9 4v2.2H8.1v3h2.6V20H2.5z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://twitter.com/nresearchnews?lang=en" aria-label="Nature on Twitter" data-track="click" data-track-action="twitter" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M17.6 4.1c.8-.5 1.5-1.4 1.8-2.4-.8.5-1.7.9-2.6 1-.7-.8-1.8-1.4-3-1.4-2.3 0-4.1 1.9-4.1 4.3 0 .3 0 .7.1 1-3.4 0-6.4-1.8-8.4-4.4C1 2.9.8 3.6.8 4.4c0 1.5.7 2.8 1.8 3.6C2 8 1.4 7.8.8 7.5v.1c0 2.1 1.4 3.8 3.3 4.2-.3.1-.7.2-1.1.2-.3 0-.5 0-.8-.1.5 1.7 2 3 3.8 3-1.3 1.1-3.1 1.8-5 1.8-.3 0-.7 0-1-.1 1.8 1.2 4 1.9 6.3 1.9C13.8 18.6 18 12 18 6.3v-.6c.8-.6 1.5-1.4 2-2.2-.7.3-1.5.5-2.4.6z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.youtube.com/channel/UCvCLdSgYdSTpWcOgEJgi-ng" aria-label="Nature on YouTube" data-track="click" data-track-action="youtube" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M7.9 12.6V6.9l5.4 2.8c0 .1-5.4 2.9-5.4 2.9zM19.8 6s-.2-1.4-.8-2c-.8-.8-1.6-.8-2-.9-2.8-.2-7-.2-7-.2s-4.2 0-7 .2c-.4 0-1.2 0-2 .9-.6.6-.8 2-.8 2S0 7.6 0 9.2v1.5c0 1.7.2 3.3.2 3.3s.2 1.4.8 2c.8.8 1.8.8 2.2.9 1.6.1 6.8.2 6.8.2s4.2 0 7-.2c.4 0 1.2-.1 2-.9.6-.6.8-2 .8-2s.2-1.6.2-3.3V9.2c0-1.6-.2-3.2-.2-3.2z"/></svg>
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="c-footer__grid">
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Discover content</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/siteindex" data-track="click" data-track-action="journals a-z" data-track-label="link">Journals A-Z</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/subjects/" data-track="click" data-track-action="article by subject" data-track-label="link">Articles by subject</a></li>
+ <li class="c-footer__item"><a href="https://nano.nature.com/" data-track="click" data-track-action="nano" data-track-label="link">Nano</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/protocolexchange/" data-track="click" data-track-action="protocol exchange" data-track-label="link">Protocol Exchange</a></li>
+ <li class="c-footer__item"><a href="https://www.natureindex.com/" data-track="click" data-track-action="nature index" data-track-label="link">Nature Index</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Publish with us</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/author_resources/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to Authors</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/peer_review/" data-track="click" data-track-action="guide to referees" data-track-label="link">Guide to Referees</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/publishing-with-npg/" data-track="click" data-track-action="open access" data-track-label="link">Open access</a></li>
+ <li ><a href="https://www.nature.com/reprints/" data-track="click" data-track-action="reprints and permissions" data-track-label="link">Reprints &amp; permissions</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Researcher services</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/authors/research-data" data-track="click" data-track-action="data research service" data-track-label="link">Research data</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/go/nr" data-track="click" data-track-action="language editing" data-track-label="link">Language editing</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/scientific-editing/" data-track="click" data-track-action="scientific editing" data-track-label="link">Scientific editing</a></li>
+ <li class="c-footer__item"><a href="https://masterclasses.nature.com/" data-track="click" data-track-action="nature masterclasses" data-track-label="link">Nature Masterclasses</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/researcher-training/" data-track="click" data-track-action="nature research academies" data-track-label="link">Nature Research Academies</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Libraries &amp; institutions</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/tools-services" data-track="click" data-track-action="librarian service and tools" data-track-label="link">Librarian service &amp; tools</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/manage-your-account/librarianportal" data-track="click" data-track-action="librarian portal" data-track-label="link">Librarian portal</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/about-open-access/information-for-institutions/" data-track="click" data-track-action="open research" data-track-label="link">Open research</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Advertising &amp; partnerships</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/digital-advertising/" data-track="click" data-track-action="advertising" data-track-label="link">Advertising</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/" data-track="click" data-track-action="partnerships and services" data-track-label="link">Partnerships &amp; Services</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/media-kits/" data-track="click" data-track-action="media kits" data-track-label="link">Media kits</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/branded-content-native-advertising/" data-track-action="branded content" data-track-label="link">Branded content</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Career development</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/naturecareers" data-track="click" data-track-action="nature careers" data-track-label="link">Nature Careers</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureconferences/" data-track="click" data-track-action="nature conferences" data-track-label="link">Nature<span class="visually-hidden"> </span> Conferences</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureevents/" data-track="click" data-track-action="nature events" data-track-label="link">Nature<span class="visually-hidden"> </span> events</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Regional websites</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="http://www.naturechina.com" data-track="click" data-track-action="nature china" data-track-label="link">Nature China</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nindia" data-track="click" data-track-action="nature india" data-track-label="link">Nature India</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ja-jp/" data-track="click" data-track-action="nature japan" data-track-label="link">Nature Japan</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ko-kr/" data-track="click" data-track-action="nature korea" data-track-label="link">Nature Korea</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nmiddleeast/" data-track="click" data-track-action="nature middle east" data-track-label="link">Nature Middle East</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Legal &amp; Privacy</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/info/privacy.html" data-track="click" data-track-action="privacy policy" data-track-label="link">Privacy Policy</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/cookies.html" data-track="click" data-track-action="use of cookies" data-track-label="link">Use of cookies</a></li>
+ <li class="c-footer__item"><a class="optanon-toggle-display" href="javascript:;" data-track="click" data-track-action="manage cookies" data-track-label="link">Manage cookies/Do not sell my data</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/legal_notice.html" data-track="click" data-track-action="legal notice" data-track-label="link">Legal notice</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/accessibility_statement.html" data-track="click" data-track-action="accessibility statement" data-track-label="link">Accessibility statement</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/tandc.html" data-track="click" data-track-action="terms and conditions" data-track-label="link">Terms &amp; Conditions</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/ccpa" data-track="click" data-track-action="california privacy statement" data-track-label="link">California Privacy Statement</a></li>
+ </ul>
+ </div>
+ </div>
+</div>
+
+
+ </div>
+ </div>
+ </div>
+
+ <div class="c-corporate-footer">
+ <div class="u-container">
+ <img src="/static/images/logos/sn-logo-white.ea63208b81.svg" alt="Springer Nature" loading="lazy" width="140" height="14"/>
+ <p class="c-corporate-footer__legal" data-test="copyright">&copy; 2020 Springer Nature Limited</p>
+ </div>
+</div>
+
+
+ <svg class="u-hide hide">
+ <symbol id="global-icon-chevron-right" viewBox="0 0 16 16">
+ <path d="M7.782 7L5.3 4.518c-.393-.392-.4-1.022-.02-1.403a1.001 1.001 0 011.417 0l4.176 4.177a1.001 1.001 0 010 1.416l-4.176 4.177a.991.991 0 01-1.4.016 1 1 0 01.003-1.42L7.782 9l1.013-.998z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-download" viewBox="0 0 16 16">
+ <path d="M2 14c0-.556.449-1 1.002-1h9.996a.999.999 0 110 2H3.002A1.006 1.006 0 012 14zM9 2v6.8l2.482-2.482c.392-.392 1.022-.4 1.403-.02a1.001 1.001 0 010 1.417l-4.177 4.177a1.001 1.001 0 01-1.416 0L3.115 7.715a.991.991 0 01-.016-1.4 1 1 0 011.42.003L7 8.8V2c0-.55.444-.996 1-.996.552 0 1 .445 1 .996z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-email" viewBox="0 0 18 18">
+ <path d="M1.995 2h14.01A2 2 0 0118 4.006v9.988A2 2 0 0116.005 16H1.995A2 2 0 010 13.994V4.006A2 2 0 011.995 2zM1 13.994A1 1 0 001.995 15h14.01A1 1 0 0017 13.994V4.006A1 1 0 0016.005 3H1.995A1 1 0 001 4.006zM9 11L2 7V5.557l7 4 7-4V7z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-institution" viewBox="0 0 18 18">
+ <path d="M14 8a1 1 0 011 1v6h1.5a.5.5 0 01.5.5v.5h.5a.5.5 0 01.5.5V18H0v-1.5a.5.5 0 01.5-.5H1v-.5a.5.5 0 01.5-.5H3V9a1 1 0 112 0v6h8V9a1 1 0 011-1zM6 8l2 1v4l-2 1zm6 0v6l-2-1V9zM9.573.401l7.036 4.925A.92.92 0 0116.081 7H1.92a.92.92 0 01-.528-1.674L8.427.401a1 1 0 011.146 0zM9 2.441L5.345 5h7.31z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-search" viewBox="0 0 22 22">
+ <path fill-rule="evenodd" d="M21.697 20.261a1.028 1.028 0 01.01 1.448 1.034 1.034 0 01-1.448-.01l-4.267-4.267A9.812 9.811 0 010 9.812a9.812 9.811 0 1117.43 6.182zM9.812 18.222A8.41 8.41 0 109.81 1.403a8.41 8.41 0 000 16.82z"/>
+ </symbol>
+ <symbol id="global-icon-info" viewBox="0 0 18 18">
+ <path d="m9 0c4.9705627 0 9 4.02943725 9 9 0 4.9705627-4.0294373 9-9 9-4.97056275 0-9-4.0294373-9-9 0-4.97056275 4.02943725-9 9-9zm0 7h-1.5l-.11662113.00672773c-.49733868.05776511-.88337887.48043643-.88337887.99327227 0 .47338693.32893365.86994729.77070917.97358929l.1126697.01968298.11662113.00672773h.5v3h-.5l-.11662113.0067277c-.42082504.0488782-.76196299.3590206-.85696816.7639815l-.01968298.1126697-.00672773.1166211.00672773.1166211c.04887817.4208251.35902055.761963.76398144.8569682l.1126697.019683.11662113.0067277h3l.1166211-.0067277c.4973387-.0577651.8833789-.4804365.8833789-.9932723 0-.4733869-.3289337-.8699473-.7707092-.9735893l-.1126697-.019683-.1166211-.0067277h-.5v-4l-.00672773-.11662113c-.04887817-.42082504-.35902055-.76196299-.76398144-.85696816l-.1126697-.01968298zm0-3.25c-.69035594 0-1.25.55964406-1.25 1.25s.55964406 1.25 1.25 1.25 1.25-.55964406 1.25-1.25-.55964406-1.25-1.25-1.25z" fill-rule="evenodd"/>
+ </symbol>
+ </svg>
+
+</footer>
+
+
+</body>
+</html>
+
diff --git a/python/tests/files/peerj_oa_article.html b/python/tests/files/peerj_oa_article.html
new file mode 100644
index 0000000..f2cf365
--- /dev/null
+++ b/python/tests/files/peerj_oa_article.html
@@ -0,0 +1,2365 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+ <meta charset="utf-8">
+
+ <title>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles [PeerJ]</title>
+
+
+ <link rel="dns-prefetch" href="https://d2pdyyx74uypu5.cloudfront.net/">
+ <link rel="dns-prefetch" href="http://static.peerj.com/">
+<link rel="dns-prefetch" href="https://doi.org">
+
+
+ <meta name="citation_title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"><meta name="citation_date" content="2018-02-13"><meta name="citation_doi" content="10.7717/peerj.4375"><meta name="citation_language" content="en"><meta name="citation_pdf_url" content="https://peerj.com/articles/4375.pdf"><meta name="citation_fulltext_html_url" content="https://peerj.com/articles/4375"><meta name="citation_volume" content="6"><meta name="citation_firstpage" content="e4375"><meta name="citation_keywords" content="Open access; Open science; Scientometrics; Publishing; Libraries; Scholarly communication; Bibliometrics; Science policy"><meta name="citation_journal_title" content="PeerJ"><meta name="citation_journal_abbrev" content="PeerJ"><meta name="citation_publisher" content="PeerJ Inc."><meta name="citation_issn" content="2167-8359"><meta name="citation_author" content="Heather Piwowar"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="heather@impactstory.org"><meta name="citation_author" content="Jason Priem"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="jason@impactstory.org"><meta name="citation_author" content="Vincent Larivière"><meta name="citation_author_institution" content="École de bibliothéconomie et des sciences de l’information, Université de Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author" content="Juan Pablo Alperin"><meta name="citation_author_institution" content="Canadian Institute for Studies in Publishing, Simon Fraser University, Vancouver, BC, Canada"><meta name="citation_author_institution" content="Public Knowledge Project, Canada"><meta name="citation_author" content="Lisa Matthias"><meta name="citation_author_institution" content="Scholarly Communications Lab, Simon Fraser University, Vancouver, Canada"><meta name="citation_author" content="Bree Norlander"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Ashley Farley"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Jevin West"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author" content="Stefanie Haustein"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="School of Information Studies, University of Ottawa, Ottawa, ON, Canada">
+ <meta name="description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+
+ <meta property="og:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+ <meta name="twitter:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:url" content="https://peerj.com/articles/4375">
+ <meta name="twitter:site" content="@thePeerJ">
+ <meta name="twitter:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta name="twitter:description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+ <meta property="og:type" content="article">
+ <meta property="og:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta property="og:url" content="https://peerj.com/articles/4375">
+ <meta property="og:site_name" content="PeerJ">
+
+
+ <link rel="alternate" type="application/pdf" href="/articles/4375.pdf">
+ <link rel="alternate" type="application/rdf+xml" href="/articles/4375.rdf">
+ <link rel="alternate" type="application/json" href="/articles/4375.json">
+ <link rel="alternate" type="application/xml" href="/articles/4375.xml">
+ <link rel="alternate" type="application/unixref+xml" href="/articles/4375.unixref">
+ <link rel="alternate" type="application/vnd.citationstyles.csl+json" href="/articles/4375.citeproc">
+ <link rel="alternate" type="application/bibjson+json" href="/articles/4375.bibjson">
+ <link rel="alternate" type="text/html" href="/articles/4375.html">
+
+ <link rel="canonical" href="https://peerj.com/articles/4375/">
+
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta property="fb:app_id" content="534542813234464">
+
+ <link rel="stylesheet" href="/css/05b9c3d-27443c7.css" media="screen">
+
+<!--[if lt IE 9]>
+ <link rel="stylesheet" href="/assets/css/ie8.css" media="screen">
+<![endif]-->
+
+<!--[if lt IE 10]>
+ <link rel="stylesheet" href="/assets/css/ie9.css" media="screen">
+<![endif]-->
+
+ <style media="screen">html, body { height: 100%; }</style>
+ <link rel="stylesheet" href="https://cdn.peerj.com/webpack/vue-bundle.2cdd25e1.css">
+
+
+ <link rel="stylesheet" href="/css/a0c1a2c-04690d8.css" media="screen">
+
+ <link rel="stylesheet" href="/css/be477b9-1134171.css" media="screen">
+ <link rel="stylesheet" href="/css/3e4ba6d-c134b5f.css" media="print">
+ <script src="/js/36e5d51-2d7025c.js"></script>
+<script src="/assets/js/polyfills/includes.js"></script>
+<script src="/assets/js/polyfills/startsWith.js"></script><!--[if lt IE 9]>
+<script src="/assets/js/html5shiv.js"></script>
+
+<![endif]-->
+
+<!--[if lt IE 8]>
+<script src="/assets/js/json2.js"></script>
+<![endif]-->
+
+<script>
+ var PeerJ = {
+ Article: {},
+ User: {
+ anonymous: true },
+ Publication: {},
+ Production: {},
+ Event: {},
+ Com: {},
+ Payment: {},
+ Annotation: {},
+ Search: {},
+ Home: {},
+ Subjects: {},
+ Advocacy: {},
+ Job: {},
+ ContentAlert: {},
+ Tools: {}
+ };
+</script>
+
+
+<script>
+ var campaign_keywords = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'];
+ var kw = '';
+ var lastUtms = {};
+ var firstUtms = {};
+ var allUtms = {};
+
+ function campaignParams() {
+ var index;
+ for (index = 0; index < campaign_keywords.length; ++index) {
+ kw = getQueryParam(document.URL, campaign_keywords[index]);
+ if (kw.length) {
+ lastUtms[campaign_keywords[index] + '-last'] = kw;
+ firstUtms[campaign_keywords[index] + '-first'] = kw;
+ allUtms[campaign_keywords[index] + '-all'] = kw;
+ }
+ }
+ }
+
+ function updatePreregCookie(preregCookie, firstUtmKey) {
+ var utmVal = firstUtms[firstUtmKey];
+ if (utmVal) {
+ var existingPreregCampaign = $.cookie(preregCookie);
+ var appendPreregCampaign;
+ if (!existingPreregCampaign) {
+ appendPreregCampaign = utmVal;
+ } else {
+ appendPreregCampaign = existingPreregCampaign + ',' + utmVal;
+
+ }
+ $.cookie(preregCookie, appendPreregCampaign, {expires: 365, path: "/"});
+ }
+ }
+
+ function getQueryParam(url, param) {
+ // Expects a raw URL
+ param = param.replace(/[[]/, "\[").replace(/[]]/, "\]");
+ var regexS = "[\?&]" + param + "=([^&#]*)",
+ regex = new RegExp( regexS ),
+ results = regex.exec(url);
+ if (results === null || (results && typeof(results[1]) !== 'string' && results[1].length)) {
+ return '';
+ } else {
+ return decodeURIComponent(results[1]).replace(/\W/gi, ' ');
+ }
+ }
+
+ function articlePageEvent() {
+ var articleContainer = $('.publication-jsondata');
+ if (articleContainer.length) {
+ var data = articleContainer.data('publication-meta');
+
+ // Must be public
+ if (data.publicationSubjects.length) {
+
+ var eventName = 'Viewed-article';
+ var preprint = data.preprint;
+ if (preprint) {
+ eventName = 'Viewed-preprint';
+ }
+
+ data['ip-hash'] = 'bf3914b8088a79fb1fcf39cb526631c0';
+ mixpanel.track(eventName, data);
+ }
+ }
+ }
+
+ function sectionListViewEvent() {
+ }
+</script>
+ <script>
+ // User agrees to terms on signup, so Mixpanel is OK
+ // On submit, update mixpanel distinct id
+ setTimeout(function () {
+ var regmixpanel = document.getElementById('fos_user_registration_form_mixpanelId');
+ if (regmixpanel) {
+ var distinctId = $.cookie('pj_mp_distinct');
+ if (!distinctId) {
+ distinctId = mixpanel.get_distinct_id();
+ }
+ console.log(distinctId);
+ regmixpanel.value = distinctId;
+ }
+ }, 1500);
+
+ // If logged out then check if consented to analytics cookies (if applicable to country)
+ // Run through cookieConsent only
+ PeerJ.Com.Mixpanel = new function() {
+ this.leadView = function() {
+ mixpanel.init('776a79e14e8f05a81ca92536c83f08b4', {
+ 'secure_cookie': true,
+ loaded: function (mixpanel) {
+ setTimeout(function () {
+ articlePageEvent();
+
+ sectionListViewEvent();
+
+
+
+ }, 1000);
+ }
+ });
+ }
+ };
+
+ campaignParams();
+ updatePreregCookie('pj_prereg_campaign', 'utm_campaign-first');
+ updatePreregCookie('pj_prereg_content', 'utm_content-first');
+ updatePreregCookie('pj_prereg_term', 'utm_term-first');
+ </script>
+
+
+
+ <script>(function(p,u,s,h,x){p.pushpad=p.pushpad||function(){(p.pushpad.q=p.pushpad.q||[]).push(arguments)};h=u.getElementsByTagName('head')[0];x=u.createElement('script');x.async=1;x.src=s;h.appendChild(x);})(window,document,'https://pushpad.xyz/pushpad.js');
+pushpad('init', 5977, {hostname: 'peerj.com'});
+</script>
+
+ <link rel="search" type="application/opensearchdescription+xml" href="https://peerj.com/articles/osd.xml" title="PeerJ">
+
+
+
+
+
+ <script>
+ // Run through cookieConsent only
+ PeerJ.Com.GA = new function() {
+ this.disabletracking = function() {
+ window['ga-disable-' + 'UA-31208920-1'] = true;
+ };
+
+ this.runGA = function() {
+ (function (i, s, o, g, r, a, m) {
+ i['GoogleAnalyticsObject'] = r;
+ i[r] = i[r] || function () {
+ (i[r].q = i[r].q || []).push(arguments)
+ }, i[r].l = 1 * new Date();
+ a = s.createElement(o),
+ m = s.getElementsByTagName(o)[0];
+ a.async = 1;
+ a.src = g;
+ m.parentNode.insertBefore(a, m)
+ })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');
+
+ ga('create', 'UA\u002D31208920\u002D1', 'auto');
+
+ // Removes last octet
+ ga('set', 'anonymizeIp', true);
+
+
+
+
+
+
+
+
+
+ ga('set', 'dimension4', ';Legal\u0020Issues\u003BScience\u0020Policy\u003BData\u0020Science;');
+
+ ga('require', 'displayfeatures');
+
+ ga('send', 'pageview');
+
+ window.setTimeout(function () {
+ ga('send', 'event', 'adjusted bounce rate', 'page visit 15 seconds or more');
+ }, 15000);
+
+
+ }
+ };
+ </script>
+ <script src="/js/8548491-f0f5b7c.js"></script>
+
+<link rel="apple-touch-icon" sizes="57x57" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-57x57.png">
+<link rel="apple-touch-icon" sizes="60x60" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-60x60.png">
+<link rel="apple-touch-icon" sizes="72x72" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-72x72.png">
+<link rel="apple-touch-icon" sizes="76x76" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-76x76.png">
+<link rel="apple-touch-icon" sizes="114x114" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-114x114.png">
+<link rel="apple-touch-icon" sizes="120x120" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-120x120.png">
+<link rel="apple-touch-icon" sizes="144x144" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-144x144.png">
+<link rel="apple-touch-icon" sizes="152x152" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-152x152.png">
+<link rel="apple-touch-icon" sizes="180x180" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-180x180.png">
+<link rel="icon" type="image/png" sizes="192x192" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/android-icon-192x192.png">
+<link rel="shortcut icon" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon.ico">
+<link rel="icon" type="image/png" sizes="32x32" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-32x32.png">
+<link rel="icon" type="image/png" sizes="96x96" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-96x96.png">
+<link rel="icon" type="image/png" sizes="16x16" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-16x16.png">
+<link rel="manifest" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/manifest.json">
+<meta name="msapplication-TileColor" content="#ffffff">
+<meta name="msapplication-TileImage" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/ms-icon-144x144.png">
+<meta name="msapplication-config" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/browserconfig.xml">
+<meta name="theme-color" content="#ffffff"></head>
+
+<body class="">
+
+ <!-- FreshDesk variable (TODO: move elsewhere) -->
+
+
+<nav class="navbar navbar-fixed-top navbar-inverse navbar-alpha" role="navigation"><div class="navbar-inner"><!-- .btn-navbar is used as the toggle for collapsed navbar content --><a class="btn btn-navbar pull-right" data-toggle="collapse" data-target=".nav-collapse"><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></a><!-- logo --><ul class="nav pull-left nav-sections nav-journal"><li class="dropdown"><a href="/" class="dropdown-toggle "
+ data-toggle="dropdown"><span id="navJournalTitle">PeerJ Journals</span><b class="caret"></b></a><ul class="dropdown-menu journal-list"><li><a href="/">PeerJ Publishing Overview</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">PeerJ – Life & Environment</a><ul class="dropdown-menu"><li><a href="/sections/">About the journal Sections</a></li><li class="divider"></li><li><a href="/sections/aquatic-biology/">Aquatic Biology</a></li><li><a href="/sections/biochemistry-biophysics-molecular-biology/">Biochemistry, Biophysics and Molecular Biology</a></li><li><a href="/sections/biodiversity-conservation/">Biodiversity and Conservation</a></li><li><a href="/sections/bioinformatics-genomics/">Bioinformatics and Genomics</a></li><li><a href="/sections/brain-cognition/">Brain and Cognition</a></li><li><a href="/sections/ecology/">Ecology</a></li><li><a href="/sections/environ-sci/">Environmental Science</a></li><li><a href="/sections/microbiology/">Microbiology</a></li><li><a href="/sections/paleontology-evolutionary-science/">Paleontology and Evolutionary Science</a></li><li><a href="/sections/plant-biology/">Plant Biology</a></li><li><a href="/sections/zoological-science/">Zoological Science</a></li></ul></li><li><a href="/computer-science/">
+ PeerJ Computer Science
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Physical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Organic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Inorganic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Analytical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Materials Science
+ </a></li><li class="divider"></li><li><a href="https://peerj.org/" target="_blank">Visit PeerJ.org and get involved</a></li></ul></li></ul><!-- mobile-only top nav items --><ul class="nav pull-left nav-about-phone hidden-desktop"><li class="dropdown"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><li><!-- checkout items --></li><li><!-- notifications --></li></ul><!-- sections --><ul class="nav pull-left nav-collapse nav-sections nav-sections-main collapse search-hide"><li class="dropdown visible-desktop"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><!-- more --><li class="dropdown"><a href="#" class="dropdown-toggle"
+ data-toggle="dropdown">More <b class="caret"></b></a><ul class="dropdown-menu" role="menu" aria-labelledby="dLabel"><li><a href="/expertrxiv/"><img src="/assets/images/icons/expertrxiv.png" style="width: 80px"/></a></li><li><a href="/subjects/">Subjects</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Search articles</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Peer-reviewed Journals</li><li><a tabindex="-1" href="/articles/?journal=peerj">PeerJ (Life, Biological, Environmental and Health Sciences)</a></li><li><a tabindex="-1" href="/articles/?journal=cs">PeerJ Computer Science</a></li><li><a tabindex="-1" href="/articles/?journal=pchem">PeerJ Physical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ochem">PeerJ Organic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ichem">PeerJ Inorganic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=achem">PeerJ Analytical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=matsci">PeerJ Materials Science</a></li><li role="presentation" class="dropdown-header">Preprints</li><li><a tabindex="-1" href="/preprints/">PeerJ Preprints</a></li></ul></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Table of contents</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Table of Contents - current and archives</li><li><a tabindex="-1" href="/medicine/">PeerJ - Medicine articles</a></li><li><a tabindex="-1" href="/biology/">PeerJ - Biology & Life science articles</a></li><li><a tabindex="-1" href="/environment/">PeerJ - Environmental Science articles</a></li><li><a tabindex="-1" href="/general/">PeerJ - General bio (stats, legal, policy, edu)</a></li><li class="divider"></li><li><a tabindex="-1" href="/cs/">PeerJ Computer Science</a></li><li class="divider"></li><li><a tabindex="-1" href="/preprints-toc/">PeerJ Preprints</a></li></ul></li><li><a href="/academic-boards/advisors/">Academic advisors</a></li><li><a href="/reviewer-match/">Volunteer to review</a></li><li><a href="/collections/">Collections</a></li><li><a href="/questions/">Discussions</a></li><li><a href="https://peerj.com/blog/">Blog</a></li><li><a href="/prepaid-publishing/">Prepaid Publishing</a></li><li><a href="/about/reviews/">Reviews and awards</a></li><li><a href="/spread-the-word/">Spread the word</a></li><li><a href="/about/">Who are we?</a></li><li><a href="/about/contact/">Contact</a></li></ul></li></ul><!-- search --><div class="nav nav-collapse collapse pull-right nav-search"><form class="navbar-search" action="/search/"><input name="q" type="search"
+ data-autocomplete-url="/search/"
+ class="search-query" placeholder="Search"><!--<i class="icon-search"></i>--></form></div><ul class="nav pull-right nav-collapse collapse search-hide nav-utilities"><!-- login desktop --><li><a id="front-page-login" href="/login">Login</a></li></ul><ul class="nav pull-right search-hide nav-shifter"></ul><!-- for authors, my manuscripts --><ul class="nav nav-center nav-collapse collapse search-hide pull-right"><!-- for authors --><li class="dropdown nav-authors"><a href="#" class="dropdown-toggle" data-toggle="dropdown"><i
+ class="icon-info4 icon-large nav-icon icomoon"></i><span class="visible-wide">AUTHORS</span><b class="caret"></b></a><ul class="dropdown-menu"><li><a href="/benefits/">Peer Journals Overview</a></li><li><a href="/about/author-instructions/">Submission Guidelines</a></li><li><a href="/subjects/">Subject Areas</a></li><li><a href="/academic-boards/">Editorial Board</a></li><li><a href="/about/editorial-criteria/">Editorial Criteria</a></li><li><a href="/pricing/">Pricing</a></li><li><a href="/about/FAQ/">General FAQ</a></li><li><a href="/computer-science/faq-cs/">Computer Science FAQ</a></li><li><a href="/about/aims-and-scope/">Aims and Scope</a></li><li><a href="/about/author-interviews/">Author Interviews</a></li><li><a href="/about/policies-and-procedures/">Policies and Procedures</a></li><!--<li><a href="#">Why PeerJ?</a></li>--></ul></li><!-- my manuscripts --><!-- note: dropdown classes used just to maintain display --><li class="nav-manuscripts dropdown"><a href="/new/" class="dropdown-toggle"><span>SUBMIT ARTICLE</span></a></li></ul></div></nav>
+
+ <div class="item-top-navbar">
+ <div class="item-top-navbar-inner">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <div class="span12">
+ <div class="item-metrics-counts-top-nav article-item-metrics-counts">
+ <span class="article-item-metrics-count visible-all">
+ <span data-count="citations">203</span>
+ <span class="article-item-metrics-label">Citations</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-html">&nbsp;</span>
+ <span class="article-item-metrics-label">Views</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-pdf">&nbsp;</span>
+ <span class="article-item-metrics-label">Downloads</span>
+ </span>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</div>
+
+ <div id="wrap">
+
+
+
+ <div id="nav-pad"></div>
+
+
+ <div class="container">
+
+ <noscript class="js-disabled-warning">
+ <div class="alert alert-danger">
+ <i class="icon icon-warning-sign"></i> Javascript is disabled in your browser. Please <a href="https://www.enable-javascript.com" target="_blank">enable Javascript</a> to view PeerJ.
+ </div>
+ </noscript>
+
+
+ <div class="row publication-jsondata" data-publication-meta="{&quot;publicationId&quot;:&quot;4375&quot;,&quot;Article-section&quot;:&quot;NA&quot;,&quot;journal&quot;:&quot;PeerJ&quot;,&quot;published&quot;:&quot;2018-02-13 08:54:18&quot;,&quot;preprint&quot;:false,&quot;publicationSubjects&quot;:[&quot;Legal Issues&quot;,&quot;Science Policy&quot;,&quot;Data Science&quot;],&quot;publicationInstitutions&quot;:[&quot;Simon Fraser University&quot;,&quot;University of Washington&quot;,&quot;University of Ottawa&quot;],&quot;publicationTop20Institution&quot;:true,&quot;publicationInstitutionPlan&quot;:true}">
+ <!-- Left sidebar -->
+ <div class="span1 article-sidebar">
+ <div class="article-sidebar-left">
+ <div class="sidebar-box sidebar-box--journal">
+ <a href="/" class="sidebar-box--journal-mask"></a>
+ <img src="https://d2pdyyx74uypu5.cloudfront.net/images/article/logos/article-logo-peerj.png">
+ </div>
+
+ <div id="btn-view-tweets" class="sidebar-box sidebar-box--tweet">
+ <div class="text-center">View 618 tweets <i class="icon-twitter"></i></div>
+ </div>
+
+ <a href="#related-research" class="sidebar-box sidebar-box--related text-center">
+ Related research
+ <i class="icon-angle-down"></i>
+ </a>
+
+ <!-- mobile only -->
+ <div class="item-leftside-actions">
+ <div class="sidebar-box sidebar-box--action js-download-modal-trigger">Download</div>
+
+ <div id="notification-actions-mobile" class="sidebar-box sidebar-box--action" data-href="/following/publication/4522/">
+ <span class="follow-btn " id="item-left-follow-btn"
+ title="Receive article updates" data-toggle="tooltip" data-success-modal="#followModal"
+ data-href="/follow/publication/4522/0/">
+ <span class="button_text_follow">Follow</span class="follow-btn publication-label publication-label-general publication-label-middle" id="item-left-follow-btn"
+ ></span>
+</div>
+
+
+
+ <div class="sidebar-box sidebar-box--social visible-desktop">
+ <div class="sidebar-box--social-title">Share</div>
+ <div class="d-flex">
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </div>
+</div>
+
+<div class="btn-group sidebar-box sidebar-box--action">
+ <a href="#" class="btn-share dropdown-toggle" data-toggle="dropdown">Share</a>
+
+ <ul class="dropdown-menu">
+ <li>
+ <a href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+ </ul>
+</div>
+
+ </div>
+
+ </div>
+
+ <div class="peer-reviewed visible-phone">
+ <i class="icon-ok"></i> PEER-REVIEWED
+ </div>
+
+ </div>
+
+ <div id="annotations-sidebar" class="span5"></div>
+
+ <!-- Middle col -->
+ <div id="article-item-middle" class="span7"
+ data-ms-type-entity="articles" data-ms-type-id="research-article" data-ms-type-text="Research-article">
+
+ <div id="article-tweets-container">
+ <div class="row-fluid article-tweets-header">
+ <div class="span9">
+ <h2><em>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</em></h2>
+ </div>
+ <div class="span3">
+ <div class="btn btn-inverse pull-right" id="btn-view-article"><span class="icon-file"></span> View article</div>
+ </div>
+ </div>
+ <div class="tweet-items"> <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1297703289707016194/-sYklkZs_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=164969574" target="_blank"><strong></strong> <span class="twitter-handle">@LorenAndreaEP</span></a>
+ <span class="item-tweet-date">11 days ago</span>
+ </div>
+ <div>RT @AMAldanaS: También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradore…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1293635358064807937/YCE7J6e-_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15271321" target="_blank"><strong>Rachel Borchardt</strong> <span class="twitter-handle">@ButternutSquash</span></a>
+ <span class="item-tweet-date">12 days ago</span>
+ </div>
+ <div>@ces43 May I recommend Piwowar and Priem et al&#039;s article for that topic? https://t.co/Fnm0vtYtKS</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1210228942415814656/L6yRkSyu_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1117109826" target="_blank"><strong>Ana M. Aldana</strong> <span class="twitter-handle">@AMAldanaS</span></a>
+ <span class="item-tweet-date">40 days ago</span>
+ </div>
+ <div>También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradores de 2018 en donde se evidencia la ventaja de publicar en green open access: . https://t.co/1HAmYlfoBP</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/982225468286840837/BM5R0jJh_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=982223918223130624" target="_blank"><strong>Scicomm</strong> <span class="twitter-handle">@ScicommBot</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/856499301358477312/GLL-DiUg_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=850296415708471297" target="_blank"><strong>Open Pharma</strong> <span class="twitter-handle">@_OpenPharma</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">102 days ago</span>
+ </div>
+ <div>@Mietmensch @unpaywall Gotcha. It&#039;s tough to generalize the answer to that, as it depends a lot on the specific journal and field. We dove into the details more in this paper, though: https://t.co/HRus7k3P0B</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">103 days ago</span>
+ </div>
+ <div>@dwhly @unpaywall @hpiwowar historical stats are in here: https://t.co/HRus7k3P0B
+
+prediction for future is here: https://t.co/ex0vvThc9G</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/456347532637896704/We-tZ-rF_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=13616592" target="_blank"><strong>Eric Sieverts</strong> <span class="twitter-handle">@sieverts</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/633201529575632897/5rB4RNtd_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=163244377" target="_blank"><strong>Hector Keun</strong> <span class="twitter-handle">@hectorkeun</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @OxonAndrew: A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact o…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1233869298344611840/suKOWJtS_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1024381399447613443" target="_blank"><strong>Asynchrony</strong> <span class="twitter-handle">@temporalization</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @egonwillighagen: the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJ…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/447652981291614208/RtR2dZtC_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=536409536" target="_blank"><strong>Andrew Singer</strong> <span class="twitter-handle">@OxonAndrew</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles” @thePeerJ https://t.co/yCu96hCzMK</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+                                <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighagen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJV72Uf https://t.co/DE9MPIKTdZ</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+                                <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighagen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>@egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for values.</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1220321309411942408/nhm-dSur_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1215236299344502791" target="_blank"><strong>Open Science Community Maastricht</strong> <span class="twitter-handle">@OSCMaastricht</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1263564961068077059/CKFX9dV2_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=371391064" target="_blank"><strong>Marie E McVeigh</strong> <span class="twitter-handle">@JopieNet</span></a>
+ <span class="item-tweet-date">121 days ago</span>
+ </div>
+ <div>@lisalibrarian @ashleydfarley @andy_nobes Usual def of &quot;bronze&quot; in @our_research is free to read, but does not have CC license.
+https://t.co/T34fQja0nN</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">146 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+
+<div class="tweet-pagination pagination">
+
+ <ul>
+
+ <li class="active"><a href="#">1</a></li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2" class="page">2</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=3" class="page">3</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=4" class="page">4</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=5" class="page">5</a>
+ </li>
+
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2">Next</a>
+ </li>
+ </ul>
+
+ <hr>
+</div></div>
+</div>
+ <div id="article-main-container">
+ <div class="article-section-breadcrumb">
+ <span class="icon-angle-left"></span>
+ <span><a href="/"><em>PeerJ</em></a></span>
+ </div>
+
+
+ <div class="hidden-print">
+
+ <div id="article-preexisting" class="well peerj-paper-well" >
+ <i class="icon-pushpin icon-large"></i> Note that a <a href="/preprints/3119/">Preprint of this article</a> also exists, first published August 2, 2017.
+ </div>
+ </div>
+
+ <!-- Main article -->
+ <article itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle"><header class="article-meta front"><h1 class="article-title" itemprop="name headline">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</h1>
+<div class="article-authors">
+<span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-1" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-1" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Heather</span> <span class="surname" itemprop="familyName">Piwowar</span></span></a><a class="corresp" href="mailto:heather@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-2" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-2" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jason</span> <span class="surname" itemprop="familyName">Priem</span></span></a><a class="corresp" href="mailto:jason@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-3" data-jats-contrib-type="author" itemprop="author"><a href="author-3" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Vincent</span> <span class="surname" itemprop="familyName">Larivière</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-2" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-2">2</a>,<a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-4" data-jats-contrib-type="author" itemprop="author"><a href="author-4" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Juan Pablo</span> <span class="surname" itemprop="familyName">Alperin</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-4" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-4">4</a>,<a class="aff xref" href="#aff-5" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-5">5</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-5" data-jats-contrib-type="author" itemprop="author"><a href="author-5" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Lisa</span> <span class="surname" 
itemprop="familyName">Matthias</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-6" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-6">6</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-6" data-jats-contrib-type="author" itemprop="author"><a href="author-6" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Bree</span> <span class="surname" itemprop="familyName">Norlander</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-7" data-jats-contrib-type="author" itemprop="author"><a href="author-7" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Ashley</span> <span class="surname" itemprop="familyName">Farley</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-8" data-jats-contrib-type="author" itemprop="author"><a href="author-8" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jevin</span> <span class="surname" itemprop="familyName">West</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-9" data-jats-contrib-type="author" itemprop="author"><a href="author-9" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Stefanie</span> <span class="surname" itemprop="familyName">Haustein</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a>,<a class="aff xref" href="#aff-9" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-9">9</a></sup></span>
+</div>
+<div id="article-information">
+<div class="article-notes">
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-1">
+<span class="article-label-container"><a class="article-label">1</a></span><span itemprop="address"><span class="institution">Impactstory</span>, <span class="city">Sanford</span>, <span class="state">NC</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-2">
+<span class="article-label-container"><a class="article-label">2</a></span><span itemprop="address"><span class="institution">École de bibliothéconomie et des sciences de l’information, Université de Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-3">
+<span class="article-label-container"><a class="article-label">3</a></span><span itemprop="address"><span class="institution">Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-4">
+<span class="article-label-container"><a class="article-label">4</a></span><span itemprop="address"><span class="institution">Canadian Institute for Studies in Publishing, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="state">BC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-5">
+<span class="article-label-container"><a class="article-label">5</a></span><span itemprop="address"><span class="institution">Public Knowledge Project</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-6">
+<span class="article-label-container"><a class="article-label">6</a></span><span itemprop="address"><span class="institution">Scholarly Communications Lab, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-7">
+<span class="article-label-container"><a class="article-label">7</a></span><span itemprop="address"><span class="institution">Information School, University of Washington</span>, <span class="city">Seattle</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-8">
+<span class="article-label-container"><a class="article-label">8</a></span><span itemprop="address"><span class="institution">FlourishOA</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-9">
+<span class="article-label-container"><a class="article-label">9</a></span><span itemprop="address"><span class="institution">School of Information Studies, University of Ottawa</span>, <span class="city">Ottawa</span>, <span class="state">ON</span>, <span class="country">Canada</span></span>
+</div>
+</div>
+<dl class="article-identifiers">
+<dt> DOI</dt>
+<dd>
+<a href="https://doi.org/10.7717/peerj.4375" itemprop="sameAs">10.7717/peerj.4375</a><meta itemprop="sameAs" content="info:doi/10.7717/peerj.4375">
+</dd>
+</dl>
+<dl class="article-dates">
+<dt>Published</dt>
+<dd><time itemprop="datePublished">2018-02-13</time></dd>
+<dt>Accepted</dt>
+<dd><time data-itemprop="dateAccepted">2018-01-25</time></dd>
+<dt>Received</dt>
+<dd><time itemprop="dateCreated">2017-08-09</time></dd>
+</dl>
+<dl class="article-editors">
+<dt>Academic Editor</dt>
+<dd itemprop="editor" itemscope="itemscope" itemtype="http://schema.org/Person"><a itemprop="url" href="editor-1" class="contrib" data-jats-contrib-type="editor"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Robert</span> <span class="surname" itemprop="familyName">McDonald</span></span></a></dd>
+</dl>
+<dl class="article-subjects">
+<dt>Subject Areas</dt>
+<dd>
+<a class="subject" itemprop="about" href="/subjects/?filter=Legal%20Issues">Legal Issues</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Science%20Policy">Science Policy</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Data%20Science">Data Science</a>
+</dd>
+<dt>Keywords</dt>
+<dd>
+<span class="kwd" itemprop="keywords">Open access</span>, <span class="kwd" itemprop="keywords">Open science</span>, <span class="kwd" itemprop="keywords">Scientometrics</span>, <span class="kwd" itemprop="keywords">Publishing</span>, <span class="kwd" itemprop="keywords">Libraries</span>, <span class="kwd" itemprop="keywords">Scholarly communication</span>, <span class="kwd" itemprop="keywords">Bibliometrics</span>, <span class="kwd" itemprop="keywords">Science policy</span>
+</dd>
+</dl>
+<dl class="article-license">
+<dt>Copyright</dt>
+<dd>© <span itemprop="copyrightYear">2018</span> <span itemprop="copyrightHolder">Piwowar et al.</span>
+</dd>
+<dt>Licence</dt>
+<dd>
+ <span class="license-p">This is an open access article distributed under the terms of the <a class="ext-link" href="http://creativecommons.org/licenses/by/4.0/" rel="license" data-jats-ext-link-type="uri">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, reproduction and adaptation in any medium and for any purpose provided that it is properly attributed. For attribution, the original author(s), title, publication source (PeerJ) and either DOI or URL of the article must be cited.</span>
+ </dd>
+</dl>
+<dl class="self-citation">
+<dt>Cite this article</dt>
+<dd>
+<span class="self-citation-authors">Piwowar H, Priem J, Larivière V, Alperin JP, Matthias L, Norlander B, Farley A, West J, Haustein S.</span> <span class="self-citation-year">2018</span>. <span class="self-citation-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</span>. <span itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="self-citation-journal" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ</span></span> <span class="self-citation-volume" itemprop="volumeNumber">6</span></span>:<span class="self-citation-elocation" itemprop="pageStart">e4375</span> <a href="https://doi.org/10.7717/peerj.4375" itemprop="url">https://doi.org/10.7717/peerj.4375</a>
+</dd>
+</dl>
+<div class="alert alert-success view-public-reviews">The authors have chosen to make <a href="/articles/4375/reviews/">the review history of this article</a> public.</div>
+</div>
+<div>
+<h2>Abstract</h2>
+<div class="abstract" itemprop="description">
+ <p>Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.</p>
+ </div>
+</div></header><main><div class="body" lang="en">
+ <section class="sec" id="intro">
+ <h2 class="heading">Introduction</h2>
+ <p id="p-1">The movement to provide open access (OA) to all research literature is now over fifteen years old. In the last few years, several developments suggest that after years of work, a sea change is imminent in OA. First, funding institutions are increasingly mandating OA publishing for grantees. In addition to the US National Institutes of Health, which mandated OA in 2008 (<a class="ext-link" href="https://publicaccess.nih.gov/index.htm" data-jats-ext-link-type="uri">https://publicaccess.nih.gov/index.htm</a>), the Bill and Melinda Gates Foundation (<a class="ext-link" href="http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy" data-jats-ext-link-type="uri">http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy</a>), the European Commission (<a class="ext-link" href="http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf" data-jats-ext-link-type="uri">http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf</a>), the US National Science Foundation (<a class="ext-link" href="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf" data-jats-ext-link-type="uri">https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf</a>), and the Wellcome Trust (<a class="ext-link" href="https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy" data-jats-ext-link-type="uri">https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy</a>), among others, have made OA diffusion mandatory for grantees. Second, several tools have sprung up to build value atop the growing OA corpus. These include discovery platforms like ScienceOpen and 1Science, and browser-based extensions like the Open Access Button, Canary Haz, and Unpaywall. Third, Sci-Hub (a website offering pirate access to full text articles) has built an enormous user base, provoking newly intense conversation around the ethics and efficiency of paywall publishing (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.11366.1" title="Looking into Pandora’s Box: the content of Sci-Hub and its usage [version 1; referees: 2 approved, 2 approved with reservations]" data-jats-ref-type="bibr" data-jats-rid="ref-26">Greshake, 2017</a>). Academic social networks like ResearchGate and Academia.edu now offer authors an increasingly popular but controversial solution to author self-archiving (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2016.08.002" title="Hybrid open access—a longitudinal study" data-jats-ref-type="bibr" data-jats-rid="ref-8">Björk, 2016a</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>). 
Finally, the increasing growth in the cost of toll-access subscriptions, particularly via so-called “Big Deals” from publishers, has begun to force libraries and other institutions to initiate large-scale subscription cancellations; recent examples include Caltech, the University of Maryland, University of Konstanz, Université de Montréal, and the national system of Peru (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>; <a class="xref xref-bibr" href="https://doi.org/10.1038%2Fnature.2016.21223" title="Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals" data-jats-ref-type="bibr" data-jats-rid="ref-41">Schiermeier &amp; Mega, 2017</a>; <a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/" title="When the wolf finally arrives: big deal cancelations in North American Libraries" data-jats-ref-type="bibr" data-jats-rid="ref-1">Anderson, 2017a</a>; <a class="xref xref-bibr" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/" title="Teurer als die Wissenschaft erlaubt" data-jats-ref-type="bibr" data-jats-rid="ref-47">Université Konstanz, 2014</a>). As the toll-access status quo becomes increasingly unaffordable, institutions are looking to OA as part of their “Plan B” to maintain access to essential literature (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>).</p>
+                            <p id="p-2">Open access is thus provoking a new surge of investment, controversy, and relevance across a wide group of stakeholders. We may be approaching a moment of great importance in the development of OA, and indeed of the scholarly communication system. However, despite the recent flurry of development and conversation around OA, there is a need for large-scale, high-quality data on the growth and composition of the OA literature itself. In particular, there is a need for a data-driven “state of OA” overview that is (a) large-scale, (b) up-to-date, and (c) reproducible. This paper attempts to provide such an overview, using a new open web service called oaDOI that finds links to legally-available OA scholarly articles.<a class="xref xref-fn" href="#fn-1" data-jats-ref-type="fn" data-jats-rid="fn-1"><sup>1</sup></a> Building on data provided by the oaDOI service, we answer the following questions:</p>
+ <ol class="list" id="list-1" data-jats-list-type="order">
+ <li class="list-item">
+<p id="p-4">What percentage of the scholarly literature is OA, and how does this percentage vary according to publisher, discipline, and publication year?</p>
+ </li>
+ <li class="list-item">
+<p id="p-5">Are OA papers more highly-cited than their toll-access counterparts?</p>
+ </li>
+ </ol>
+ <p id="p-6">The next section provides a brief review of the background literature for this paper, followed by a description of the datasets and methods used, as well as details on the definition and accuracy of the oaDOI categorization. Results are then presented, in turn, for each research question, and are followed by a general discussion and conclusions.</p>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Literature Review</h2>
+ <p id="p-7">Fifteen years of OA research have produced a significant body of literature, a complete review of which falls outside the scope of this paper (for recent, in-depth reviews, see <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.8460.3" title="The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)" data-jats-ref-type="bibr" data-jats-rid="ref-46">Tennant et al. (2016)</a> and <a class="xref xref-bibr" href="https://doi.org/10.7554%2FeLife.16800" title="How open science helps researchers succeed" data-jats-ref-type="bibr" data-jats-rid="ref-36">McKiernan et al. (2016)</a>. Here we instead briefly review three major topics from the OA literature: defining OA and its subtypes, assessing the prevalence of OA, and examining the relative citation impact of OA.</p>
+                            <p id="p-8">Despite the large literature on OA, the term itself remains “somewhat fluid” (Antelman, 2004), making an authoritative definition challenging. The most influential definition of OA comes from the 2002 Budapest Open Access Initiative (BOAI), and defines OA as making content both <i>free to read</i> and <i>free to reuse</i>, requiring the opportunity of OA users to “crawl (articles) for indexing, pass them as data to software, or use them for any other lawful purpose.” In practice, the BOAI definition is roughly equivalent to the popular “CC-BY” Creative Commons license (<a class="xref xref-bibr" href="https://creativecommons.org/licenses/by/4.0/" title="Attribution 4.0 International (CC BY 4.0)" data-jats-ref-type="bibr" data-jats-rid="ref-19">Creative Commons, 2018</a>). However, a number of other sources prefer a less strict definition, requiring only that OA “makes the research literature free to read online” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003" title="The nine flavours of open access scholarly publishing" data-jats-ref-type="bibr" data-jats-rid="ref-51">Willinsky, 2003</a>), or that it is “digital, online, [and] free of charge.” (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009" title="Status of open access in the biomedical field in 2005" data-jats-ref-type="bibr" data-jats-rid="ref-34">Matsubayashi et al., 2009</a>). Others have suggested it is more valuable to think of OA as a spectrum (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2016.1182672" title="Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool" data-jats-ref-type="bibr" data-jats-rid="ref-17">Chen &amp; Olijhoek, 2016</a>).</p>
+ <p id="p-9">Researchers have identified a number of subtypes of OA; some of these have near-universal support, while others remain quite controversial. We will not attempt a comprehensive list of these, but instead note several that have particular relevance for the current study.</p>
+ <ul class="list" id="list-2" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-10">Libre OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): extends user’s rights to read and also to reuse literature for purposes like automated crawling, archiving, or other purposes. The Libre OA definition is quite similar to the BOAI definition of OA.</p>
+ </li>
+ <li class="list-item">
+<p id="p-11">Gratis OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): in contrast to Libre, Gratis extends <i>only</i> rights to read articles.</p>
+ </li>
+ <li class="list-item">
+<p id="p-12">Gold OA: articles are published in an “OA journal,†a journal in which all articles are open directly on the journal website. In practice, OA journals are most often defined by their inclusion in the Directory of Open Access Journals (DOAJ) (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>; <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al., 2012</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-13">Green OA: Green articles are published in a toll-access journal, but self-archived in an OA archive. These “OA archives†are either disciplinary repositories like ArXiv, or “institutional repositories (IRs) operated by universities, and the archived articles may be either the published versions, or electronic preprints (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>). Most Green OA articles do not meet the BOAI definition of OA since they do not extend reuse rights (making them Gratis OA).</p>
+ </li>
+ <li class="list-item">
+<p id="p-14">Hybrid OA: articles are published in a subscription journal but are immediately free to read under an open license, in exchange for an an article processing charge (APC) paid by authors (<a class="xref xref-bibr" href="https://doi.org/10.1241%2Fjohokanri.41.678" title="Free internet access to traditional journals" data-jats-ref-type="bibr" data-jats-rid="ref-50">Walker &amp; Soichi, 1998</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-15">Delayed OA: articles are published in a subscription journal, but are made free to read after an embargo period (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=Willinsky&amp;publication_year=2009" title="The access principle: the case for open access to research and scholarship" data-jats-ref-type="bibr" data-jats-rid="ref-52">Willinsky, 2009</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-16">Academic Social Networks (ASN): Articles are shared by authors using commercial online social networks like ResearchGate and Academia.edu. While some include these in definitions of OA (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>), others argue that content shared on ASNs is not OA at all. Unlike Green OA repositories, ASNs do not check for copyright compliance, and therefore as much as half their content is illegally posted and hosted (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-017-2291-4" title="Copyright compliance and infringement in ResearchGate full-text journal articles" data-jats-ref-type="bibr" data-jats-rid="ref-30">Jamali, 2017</a>). This raises concerns over the persistence of content, since, as was the case in October 2017, publishers can and do issue large-scale takedown notices to ASN ordering the removal of infringing content (<a class="xref xref-bibr" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement" title="Publishers take ResearchGate to court, alleging massive copyright infringement" data-jats-ref-type="bibr" data-jats-rid="ref-15">Chawla, 2017</a>). Others have raised questions about the sustainability and ethics of ASN services themselves (<a class="xref xref-bibr" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html" title="A social networking site is not an open access repository" data-jats-ref-type="bibr" data-jats-rid="ref-22">Fortney &amp; Gonder, 2015</a>). Due to these concerns, and inconsistent support from the literature, we exclude ASN-hosted content from our definition of OA.<a class="xref xref-fn" href="#fn-2" data-jats-ref-type="fn" data-jats-rid="fn-2"><sup>2</sup></a> </p>
+ </li>
+ <li class="list-item">
+<p id="p-18">“Black OAâ€: Articles shared on illegal pirate sites, primarily Sci-Hub and LibGen. Although (<a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1096" title="Gold, green, and black open access" data-jats-ref-type="bibr" data-jats-rid="ref-10">Björk, 2017</a>) labels these articles as a subtype of OA, the literature has nearly no support for including Sci-Hub articles in definitions of OA. Given this, we exclude Sci-Hub and LibGen content from our definition of OA.</p>
+ </li>
+ </ul>
+ <p id="p-19">Based on the consensus (and in some cases, lack of consensus) around these definitions and subtypes, we will use the following definition of OA in the remainder of this paper: <b>OA articles are free to read online, either on the publisher website or in an OA repository.</b></p>
+ <section class="sec">
+ <h3 class="heading">Prevalence of OA</h3>
+                            <p id="p-20">Many studies have estimated what proportion of the literature is available OA, including <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0011273" title="Open access to the scientific journal literature: situation 2009" data-jats-ref-type="bibr" data-jats-rid="ref-12">Björk et al. (2010)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1186%2F1741-7015-10-124" title="Anatomy of open access publishing: a study of longitudinal development and internal structure" data-jats-ref-type="bibr" data-jats-rid="ref-31">Laakso &amp; Björk (2012)</a>, <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al. (2012)</a>, <a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al. (2013)</a>, <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> and <a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen (2013)</a>. We are not aware of any studies since 2014. The most recent two analyses estimate that more than 50% of papers are now freely available online, when one includes both OA and ASNs. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a>, the most comprehensive study to date, estimates that of papers published between 2011 and 2013, 12% of articles could be retrieved from the journal website, 6% from repositories, and 31% by other mechanisms (including ASNs). <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> also found that the availability of papers published between 1996 and 2011 increased by 4% between April 2013 and April 2014, noting that “backfilling” is a significant contributor to green OA. Their discipline-level analysis confirmed the findings of other studies, that the proportion of OA is relatively high in biomedical research and math, while notably low in engineering, chemistry, and the humanities.</p>
+ <p id="p-21">This <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> study is of particular interest because it used automated web scraping to find and identify OA content; most earlier efforts have relied on laborious manual checking of the DOAJ, publisher webpages, Google, and/or Google Scholar (though see <a class="xref xref-bibr" href="http://arxiv.org/abs/cs/0606079" title="Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact" data-jats-ref-type="bibr" data-jats-rid="ref-27">Hajjem, Harnad &amp; Gingras (2006)</a> for a notable early exception). By using automated methods, Archambault et al. were able to sample hundreds of thousands of articles, greatly improving statistical power and supporting more nuanced inferences. Moreover, by creating a system that indexes OA content, they address a major concern in the world of OA research; as <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a> observes: “A major challenge for research...has been the lack of comprehensive indexing for both OA journals and their articles.†The automated system of <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> is very accurate—it only misclassifies a paper as OA 1% of the time, and finds about 75% of all OA papers that exist online, as per <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. However, the algorithm is not able to distinguish Gold from Hybrid OA. More problematically for researchers, the database used in the study is not open online for use in follow-up research. Instead, the data has since been used to build the commercial subscription-access database 1science (<a class="ext-link" href="http://www.1science.com/oanumbr.html" data-jats-ext-link-type="uri">http://www.1science.com/oanumbr.html</a>).</p>
+ </section>
+ <section class="sec">
+ <h3 class="heading">The open access citation advantage</h3>
+ <p id="p-22">Several dozen studies have compared the citation counts of OA articles and toll-access articles. Most of these have reported higher citation counts for OA, suggesting a so-called “open access citation advantage†(OACA); several annotated bibliographies have been created to track this literature (<a class="xref xref-bibr" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/" title="The open access citation advantage: list of studies until 2015" data-jats-ref-type="bibr" data-jats-rid="ref-43">SPARC Europe, 2015</a>; <a class="xref xref-bibr" href="https://doi.org/10.5062%2FF4Q81B0W" title="Open access citation advantage: an annotated bibliography" data-jats-ref-type="bibr" data-jats-rid="ref-49">Wagner, 2010</a>; <a class="xref xref-bibr" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D" title="The open access citation advantage" data-jats-ref-type="bibr" data-jats-rid="ref-45">Tennant, 2017</a>). The OACA is not universally supported. Many studies supporting the OACA have been criticised on methodological grounds (<a class="xref xref-bibr" href="https://doi.org/10.3163%2F1536-5050.99.3.008" title="The impact of free access to the scientific literature: a review of recent research" data-jats-ref-type="bibr" data-jats-rid="ref-21">Davis &amp; Walters, 2011</a>), and an investigation using the randomized-control trial method failed to find evidence of an OACA (<a class="xref xref-bibr" href="https://doi.org/10.1096%2Ffj.11-183988" title="Open access, readership, citations: a randomized controlled trial of scientific journal publishing" data-jats-ref-type="bibr" data-jats-rid="ref-20">Davis, 2011</a>). However, recent investigations using robust methods have continued to observe an OACA. For instance, <a class="xref xref-bibr" href="https://doi.org/10.1111%2Fecin.12064" title="Identifying the effect of open access on citations using a panel of science journals" data-jats-ref-type="bibr" data-jats-rid="ref-35">McCabe &amp; Snyder (2014)</a> used a complex statistical model to remove confounding effects of author selection (authors may selectively publish their higher-impact work as OA), reporting a small but meaningful 8% OACA. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> describe a 40% OACA in a massive sample of over one million articles using field-normalized citation rates. <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0159614" title="The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)" data-jats-ref-type="bibr" data-jats-rid="ref-38">Ottaviani (2016)</a> used a natural experiment as articles (not selected by authors) emerged from embargoes to become OA, and reports a 19% OACA excluding the author self-selection bias for older articles outside their prime citation years.</p>
+ </section>
+ </section>
+ <section class="sec" id="methods">
+ <h2 class="heading">Methods</h2>
+ <section class="sec">
+ <h3 class="heading">OA determination</h3>
+ <section class="sec">
+ <h4 class="heading">Classifications</h4>
+ <p id="p-23">We classify publications into two categories, OA and Closed. As described above, we define OA as <i>free to read online, either on the publisher website or in an OA repository</i>; all articles not meeting this definition were defined as Closed. We further divide the OA literature into one of four exclusive subcategories, resulting in a five-category classification system for articles:</p>
+ <ul class="list" id="list-3" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-24"><b>Gold</b>: Published in an open-access journal that is indexed by the DOAJ.</p>
+ </li>
+ <li class="list-item">
+<p id="p-25"><b>Green</b>: Toll-access on the publisher page, but there is a free copy in an OA repository.</p>
+ </li>
+ <li class="list-item">
+<p id="p-26"><b>Hybrid</b>: Free under an open license in a toll-access journal.</p>
+ </li>
+ <li class="list-item">
+<p id="p-27"><b>Bronze</b>: Free to read on the publisher page, but without an clearly identifiable license.</p>
+ </li>
+ <li class="list-item">
+<p id="p-28"><b>Closed</b>: All other articles, including those shared only on an ASN or in Sci-Hub.</p>
+ </li>
+ </ul>
+ <p id="p-29">These categories are largely consistent with their use throughout the OA literature, although a few clarifications are useful. First, we (like many other OA studies) do not include ASN-hosted content as OA. Second, categories are exclusive, and publisher-hosted content takes precedence over self-archived content. This means that if an article is posted in both a Gold journal and an OA repository, we would classify it as Gold, not Green. Put another way, publisher-hosted content can “shadow†archived articles that would otherwise be Green. This definition of Green (“available in a repository but <i>not</i> available from the publisherâ€) is often used in the OA literature (including by Steven Harnad, the coiner of the Green and Gold terms <a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>), but this usage is not unanimous. Some studies allow a given article to be <i>both</i> Gold and Green; compared to these, our classification system does undercount Green. Hybrid articles share properties with Gold articles (both are free to read and are licensed for re-use), but differ in the venue of publication (i.e., Hybrid articles are published in journals not considered open access by the DOAJ) and in that Hybrid articles are not necessarily immediately available (i.e., they may only be freely available after an embargo). We also add a novel subcategory, Bronze. Bronze shares attributes of Gold and Hybrid; like both, Bronze OA articles are publisher-hosted. Unlike Gold OA, Bronze articles are not published in journals considered open access in the DOAJ. Unlike Hybrid, Bronze articles carry no license information. Although this lack of identifiable license may not be intentional, without an identifiable license, the articles are free to read but do not allow extended reuse rights beyond reading. It is also not clear if Bronze articles are temporarily or permanently available to read for free.</p>
+ <p id="p-30">Finally, we should add that, although our categories of choice reflect the OA literature, they do not necessarily reflect the more complex reality of scholarly publishing today. Organizations like SciELO and Redalyc in Latin America have been acting simultaneously as publishers and repositories and many of the articles found on their site do not fall neatly into the above categories (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010" title="The SciELO open access: a gold way from the south" data-jats-ref-type="bibr" data-jats-rid="ref-39">Packer, 2010</a>).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">The oaDOI system</h4>
+ <p id="p-31">We assigned the categories above by calling the oaDOI service with a DOI for each item. The oaDOI returns a link to a legally-available OA version of the article, when one is available (<a class="ext-link" href="https://oadoi.org/" data-jats-ext-link-type="uri">https://oadoi.org/</a>). It contains records for all 88 million Crossref DOIs.<a class="xref xref-fn" href="#fn-3" data-jats-ref-type="fn" data-jats-rid="fn-3"><sup>3</sup></a> The oaDOI service crawls, aggregates, normalizes, and verifies data from many sources including PMC (<a class="ext-link" href="https://www.ncbi.nlm.nih.gov/pmc/" data-jats-ext-link-type="uri">https://www.ncbi.nlm.nih.gov/pmc/</a>), BASE (<a class="ext-link" href="https://www.base-search.net/about/en/" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/</a>), DOAJ (<a class="ext-link" href="https://doaj.org/" data-jats-ext-link-type="uri">https://doaj.org/</a>), and thousands of institutional repositories and publishers. The oaDOI system offers a fast, free API with no rate-limits, allowing it to support a variety of other services and tools. At the time of writing, oaDOI processes approximately 500,000 requests daily–roughly twice the daily uses of Sci-Hub<a class="xref xref-fn" href="#fn-4" data-jats-ref-type="fn" data-jats-rid="fn-4"><sup>4</sup></a> (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1" title="Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)" data-jats-ref-type="bibr" data-jats-rid="ref-29">Himmelstein et al., 2017</a>). The majority of this volume comes from around 700 academic libraries, who use oaDOI to help readers find articles where the library has no subscription access, addressing the discoverability problem (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen, 2013</a>). The oaDOI service also powers the Unpaywall browser extension, which helps readers to find legal OA copies of paywalled articles as they browse; Unpaywall currently has over 80,000 active users. The oaDOI codebase is open source, and the service is free and open via an open API.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Accuracy of oaDOI</h4>
+ <p id="p-34">To assess the accuracy of our automated OA determination, a random subsample of 500 articles were chosen from our main “Crossref-DOI†sample, described below. We manually searched the internet for each article in our subsample to determine if the paper was freely available on the publisher’s website, or on another website, such as an institutional repository, an academic social networking site, or on a personal webpage. DOIs were resolved by appending the DOI to “<a class="ext-link" href="https://doi.org/" data-jats-ext-link-type="uri">https://doi.org/</a>â€. If the full text was available through that link, articles were marked as being freely available from the publisher’s site. If articles required a subscription, the title of the article was entered into Google Scholar (GS) and into Google to find alternative versions (i.e., preprints or archived copies). If the fulltext was found on any publisher page or OA repository, these were marked as being freely available from an archive. If the only available open copy was hosted on an academic social network (like Academia.edu or ResearchGate), this was noted but for the sake of the study these were <i>not</i> counted as any category of OA, and were instead added to the “Closed†category;</p>
+ <p id="p-35">The performance of oaDOI is summarized below, compared to these manual accuracy checks. The complete dataset behind this summary is available in supplementary information. Using this data we calculated the recall and precision of the system. “Recall†asks the question, “when an article is open, how often does oaDOI correctly identify it as open?†The recall of the service is 77.0%, meaning that 77% of the truly open articles are correctly identified as open by oaDOI. “Precision†asks the question, “When oaDOI says an article is open, how often is it correct?†The precision of the system is 96.6%, meaning that 96.6% of the time that oaDOI reports an article is open, it really is open.</p>
+ <p id="p-36">These results can be roughly compared to the recall of 86.4% and precision of 99.1% reported by <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> for their automated system. Their accuracy estimate was also calculated based on a sample of 500 data points, giving each estimate a margin of error of ±4.5 percentage points. The Archambault study used a narrower date window for their sample (starting in 1996, versus our Crossref-DOI sample which was not time restricted), resulting in a more homogeneous task, which may partially explain their somewhat better performance.</p>
+ <p id="p-37">The oaDOI service is optimized for high precision, rather than high recall. The very high precision of oaDOI means that any estimates derived from the database can be considered a <i>conservative</i> estimate of the actual percentage of open access in the literature. That is, we can safely assume that when oaDOI reports a certain percentage of open access, the real percentage is <i>at least</i> that high—and almost certainly higher given that recall was less than perfect. Put another way, oaDOI delivers very few false positives (where it mistakenly calls an article open), but a relatively high number of false negatives (where it mistakenly calls an article closed) (<a class="xref xref-table" href="#table-1" data-jats-ref-type="table" data-jats-rid="table-1">Table 1</a>). Future improvements to the system are planned that will improve recall while keeping precision high.</p>
+ <figure class="table-wrap" id="table-1"><div class="caption">
+<span class="caption-label">Table 1: </span>
+ <div class="title">Accuracy of the prototype version of the oaDOI service used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Manual determination</th>
+ <th>oaDOI reports Open</th>
+ <th>oaDOI reports Closed</th>
+ <th>Manual total (ground truth)</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Open</td>
+ <td>144</td>
+ <td>43</td>
+ <td>187</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>5</td>
+ <td>308</td>
+ <td>313</td>
+ </tr>
+ <tr>
+ <td>Total</td>
+ <td>149</td>
+ <td>351</td>
+ <td style="text-align:left;;">500</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-1</a>
+</div>
+ </figure>
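+ <p>For concreteness, the recall and precision quoted above follow directly from the confusion matrix in Table 1, as the short calculation below shows.</p>
+ <pre><code>
+# Worked check of oaDOI recall and precision using the Table 1 confusion matrix.
+true_positives = 144   # manually verified open, oaDOI reports open
+false_negatives = 43   # manually verified open, oaDOI reports closed
+false_positives = 5    # manually verified closed, oaDOI reports open
+
+recall = true_positives / (true_positives + false_negatives)      # 144/187
+precision = true_positives / (true_positives + false_positives)   # 144/149
+print(f"recall={recall:.1%}, precision={precision:.1%}")          # 77.0%, 96.6%
+ </code></pre>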
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">Study samples</h3>
+ <p id="p-38">Three samples of DOI-assigned scholarly resources are summarized in <a class="xref xref-table" href="#table-2" data-jats-ref-type="table" data-jats-rid="table-2">Table 2</a> and described further below.</p>
+ <section class="sec">
+ <h4 class="heading">Crossref sample</h4>
+ <p id="p-39">The first sample, “Crossref-DOIs,†is a random sample of 100,000 journal articles with Crossref DOIs, across all publication years. There are approximately 88 million Crossref DOIs in total as of May 2017. In order to exclude books, datasets, and other non-article content, we sampled only items whose “type†was listed as “journal-article†in the Crossref API metadata; there are 66 million of these. To verify the accuracy of Crossref metadata, we manually checked 150 items assigned to type “journal-article,†and determined that 93% were indeed journal articles; the remaining 7% were mostly journal front-matter such as tables of content or instructions to authors.</p>
+ <figure class="table-wrap" id="table-2"><div class="caption">
+<span class="caption-label">Table 2: </span>
+ <div class="title">Summary of samples used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover table-text" data-jats-content-type="text">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Sample name</th>
+ <th>Sample size</th>
+ <th>Population sampled</th>
+ <th>Purpose</th>
+ <th>Population size</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Crossref-DOIs</td>
+ <td>100,000</td>
+ <td>All journal articles with Crossref DOIs, all years.</td>
+ <td>Estimate percentage of the literature that is OA.</td>
+ <td>66,560,153</td>
+ </tr>
+ <tr>
+ <td>WoS-DOIs</td>
+ <td>100,000</td>
+ <td>All citable WoS articles with DOIs, 2009–2015.</td>
+ <td>Estimate citation impact of recent OA papers, and also OA prevalence by discipline.</td>
+ <td>8,083,613</td>
+ </tr>
+ <tr>
+ <td>Unpaywall-DOIs</td>
+ <td>100,000</td>
+ <td>All articles accessed by Unpaywall users over a 1-week period in 2017.</td>
+ <td>Estimate percentage of OA experienced by users of the Unpaywall extension.</td>
+ <td>213,323</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-2</a>
+</div>
+ </figure>
+ <p id="p-40">The purpose of this sample is to roughly proxy the scholarly literature as a whole. As such, it has strengths and weaknesses. One weakness is that although Crossref includes information on citation counts and discipline categorization, we found these to be quite incomplete, and therefore not useful for the present study. Another is that researchers in the scientometrics and OA fields have largely relied on other indexes, particularly Scopus and Web of Science (WoS), to represent the literature as a whole; this makes our results more difficult to compare to previous work. Finally, DOIs are known to be less frequently assigned by publishers in certain disciplines (like humanities; <a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2015.11.008" title="Availability of digital object identifiers (DOIs) in web of science and scopus" data-jats-ref-type="bibr" data-jats-rid="ref-25">Gorraiz et al., 2016</a>), in certain geographic regions (particularly the developing world), and among older articles (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-016-2225-6" title="Availability of digital object identifiers in publications archived by PubMed" data-jats-ref-type="bibr" data-jats-rid="ref-14">Boudry &amp; Chartron, 2017</a>); consequently, these segments will be underrepresented in our sample. This said, Scopus and WoS are also known to underrepresent important segments of the literature (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-015-1765-5" title="The journal coverage of Web of Science and Scopus: a comparative analysis" data-jats-ref-type="bibr" data-jats-rid="ref-37">Mongeon &amp; Paul-Hus, 2016</a>), and so this failing is not limited to Crossref. Moreover, the Crossref sample has important advantages of its own over other indexes. While no sample of the scholarly literature will be complete in every regard, the Crossref index is more expansive than other sources: in July 2017 there were 67 million journal articles indexed in Crossref compared to 30 million in Scopus (<a class="ext-link" href="https://www.elsevier.com/solutions/scopus/content" data-jats-ext-link-type="uri">https://www.elsevier.com/solutions/scopus/content</a>). Also, Crossref has the advantage of being entirely free and open to use, while Scopus and WoS are subscription-access databases; this allows the study data to also be free and open, promoting replication and reuse of our results in further research. However, we did turn to the subscription-access WoS in order to answer questions about the discipline and citation counts of OA articles, since Crossref data is lacking in these areas.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">WoS sample</h4>
+ <p id="p-41">The second sample, “WoS-DOIsâ€, is a random sample of 100,000 journal articles with DOIs that are indexed by Web of Science. The sample was drawn from a local version of the WoS database at the Observatoire des sciences et des technologies (OST) at the Université du Québec à Montréal. Only articles that WoS defines as “citable items†are included in the sample; this excludes non-peer reviewed content such as editorial material and news items. This sample is restricted to articles published between 2009 and 2015, due to DOI availability constraints. The sample of 100,000 articles is randomly drawn from a population of 8 million articles and reviews with a DOI in WoS published between 2009 and 2015 as of May 2017.</p>
+ <p id="p-42">Because the WoS sample is restricted to certain publication years, due to availability of DOIs in the WoS database, this sample is unsuitable for estimating the proportion of the total literature that is OA. However, it is more useful than the Crossref sample in some ways: the WoS sample included accurate discipline information for each article (described below), and also citation counts. Therefore we use the WoS sample to assess OA prevalence by discipline and also the citation impact of recent OA papers. We do not encourage comparisons between the OA percentages in the WoS sample and the Crossref sample, because of large differences in the sampling frames.</p>
+ <p id="p-43">Documents in the WoS-DOIs sample were classified using the National Science Foundation (NSF) journal classification system. This system assigns every journal exactly one “discipline†(a high-level categorization) and exactly one “specialty†(a finer-grained categorization). Because this is a journal-level classification, all articles from a given journal are assigned the same discipline and specialty as the journal. A downside of this approach is that the system classifies multidisciplinary journals (e.g., Nature, PNAS, PLOS ONE) as “biomedical researchâ€, despite their publishing many articles from other fields.<a class="xref xref-fn" href="#fn-5" data-jats-ref-type="fn" data-jats-rid="fn-5"><sup>5</sup></a> In these cases, we used a ground-up, article-by-article classification approach. Each article published in a list of multidisciplinary journals was assigned to the NSF specialty which appeared most frequently in its own reference list. In other words, papers published in multidisciplinary journals were classified at the article level (instead of at the journal level) to the subject area which they cite most frequently.<a class="xref xref-fn" href="#fn-6" data-jats-ref-type="fn" data-jats-rid="fn-6"><sup>6</sup></a> </p>
+ <p id="p-46">We assess the relative impact of open and closed articles, using citations as an indicator of their scholarly impact. There are several properties of articles, however, that can confound this kind of comparison. Chief among these are the article’s discipline (some fields are much more cited than others) and its age (older articles have had more time to gather citations). In order to address this, we computed a normalized expected number of citations for each article, based on its age and its NSF specialty, by comparing it to the average citations for similar articles.<a class="xref xref-fn" href="#fn-7" data-jats-ref-type="fn" data-jats-rid="fn-7"><sup>7</sup></a> </p>
+ <p id="p-48">Using this approach, each article receives an average relative citation (ARC). An ARC of 1.0 indicates that a document was cited according to expectations based on documents published in the same year and NSF specialty, while an ARC above or below 1.0 indicates that the citation impact was above or below world average, respectively. Using these field-normalized citation rates, citation impact can be compared across scientific disciplines as well as across years. We can also compute mean ARCs for groups of articles, like “all open articles†or “all closed articlesâ€, allowing us to compare normalized impact between these two groups. Analyzing results on the level of NSF disciplines, data is not shown for the Humanities (<i>n</i> = 1,091) and Arts (<i>n</i> = 164), because they are underrepresented both in the Web of Science and in terms of DOI coverage.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Unpaywall sample</h4>
+ <p id="p-49">The third sample, “Unpaywall-DOIsâ€, is a random sample of 100,000 articles accessed by users of the free, open-source Unpaywall browser extension, gathered over a one-week time window. We collected IP addresses and DOI requests made to the oaDOI service through the Unpaywall browser extension during the week of June 5–June 11, 2017. In that time period there were 374,703 total accesses, 213,323 unique DOIs, and 42,894 unique IP addresses gathered in total, from which 100,000 unique DOIs were randomly sampled.</p>
+ <p id="p-50">This sample was used to assess the prevalence of OA experienced by users of the Unpaywall extension (since Unpaywall uses oaDOI data to find OA). It is a convenience sample of what articles people are interested in reading, and thereby lets us roughly estimate the percent of this literature that is OA. The sample has serious limitations, however: we don’t know the demographics of Unpaywall users, and we are aware of a bias towards users from the US (as determined by the IP addresses). As such, we cannot accurately generalize the results by education level, discipline, or purpose in reading the scholarly literature.</p>
+ </section>
+ </section>
+ </section>
+ <section class="sec" id="results">
+ <h2 class="heading">Results</h2>
+ <section class="sec">
+ <h3 class="heading">RQ1. What percent of the literature is open access?</h3>
+ <section class="sec">
+ <h4 class="heading">How much of the literature is OA?</h4>
+ <p id="p-51">We found 27.9% (95% CI [27.6–28.2]) of all DOI-assigned journal articles are OA, using the Crossref-DOI sample. Based on this, we estimate there are 18.6 million OA articles with Crossref DOIs (95% CI [18.4–18.8]). This is the total population of OA articles that can be identified and accessed by oaDOI. Given our finding (described in Methods above) that the oaDOI service finds 77% of OA compared to manual searches, we can further estimate that an additional 3.5 million articles are OA but not detectable by this version of oaDOI.</p>
+ <p id="p-52">People reading the literature using the Unpaywall browser extension encounter a significantly higher proportion of OA: we found that 47.0% (95% CI [46.7–47.3]) of the Unpaywall-accessed sample is open access. The main reason for this is article age: since this sample is based on the behavior of actual readers, it is disproportionately comprised of recent articles. In fact, half the accessed articles were published in the last 2 years. Recent articles are much more likely to be OA than their older counterparts (see Results ‘How does Open Access vary by year of publication?’ below).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">What types of Open Access are most common?</h4>
+ <p id="p-53">The proportion of OA by subtype is relatively similar across the samples, as shown in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a> and <a class="xref xref-table" href="#table-3" data-jats-ref-type="table" data-jats-rid="table-3">Table 3</a>. Green OA represents a relatively small percentage of OA articles in all three samples. This is partly because self-archived articles are only counted as Green where there is no publisher-hosted option available; that is, Green OA is sometimes “shadowed†by Gold, Bronze, or Hybrid articles. Bronze is the most common OA subtype in all the samples, which is particularly interesting given that few studies have highlighted its role. We manually inspected a small sample of Bronze articles in order to understand this subcategory more; we found that while many Bronze articles were Delayed OA from toll-access publishers, nearly half were hosted on journals that published 100% of content as free-to-read but were <i>not</i> listed on the DOAJ and did not formally license content (using CC-BY or any other license). Such journals might be better described as “Dark Gold†or “Hidden Gold†than Bronze. A more complete examination of Bronze falls outside the scope of this study, and therefore further investigation will be undertaken in future work.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-1"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 1: Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-small.jpg 355w" data-image-id="fig-1" alt="Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="230"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 1: </span>Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" class="btn btn-mini" download="peerj-4375-fig-1.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-1</a>
+</div>
+</div></figcaption></figure>
+ <figure class="table-wrap" id="table-3"><div class="caption">
+<span class="caption-label">Table 3: </span>
+ <div class="title">Percent of the literature that is OA, by type, in three samples of 100,000 journal articles, with 95% confidence intervals.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Access type</th>
+ <th style="text-align:center;" colspan="2">Crossref-DOI All journal articles with Crossref DOIs, all years. (“Articles with DOIs†in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a>)</th>
+ <th style="text-align:center;" colspan="2">WoS-DOIs All citable WoS articles with DOIs, 2009–2015</th>
+ <th style="text-align:center;" colspan="2">Unpaywall-DOIs All articles accessed by Unpaywall users over a 1-week period in 2017</th>
+ </tr>
+ <tr>
+ <th></th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>OA (all types)</td>
+ <td>27.9%</td>
+ <td>27.6–28.2</td>
+ <td>36.1%</td>
+ <td>36.0–36.2</td>
+ <td>47.0%</td>
+ <td>46.7–47.3</td>
+ </tr>
+ <tr>
+ <td>Bronze OA</td>
+ <td>16.2%</td>
+ <td>16.0–16.5</td>
+ <td>12.9%</td>
+ <td>12.6–13.2</td>
+ <td>15.3%</td>
+ <td>15.0–15.6</td>
+ </tr>
+ <tr>
+ <td>Hybrid OA</td>
+ <td>3.6%</td>
+ <td>3.3–3.9</td>
+ <td>4.3%</td>
+ <td>4.0–4.6</td>
+ <td>8.3%</td>
+ <td>8.0–8.6</td>
+ </tr>
+ <tr>
+ <td>Gold OA</td>
+ <td>3.2%</td>
+ <td>2.9–3.5</td>
+ <td>7.4%</td>
+ <td>7.1–7.7</td>
+ <td>14.3%</td>
+ <td>14.0–14.6</td>
+ </tr>
+ <tr>
+ <td>Green OA</td>
+ <td>4.8%</td>
+ <td>4.5–5.1</td>
+ <td>11.5%</td>
+ <td>11.2–11.8</td>
+ <td>9.1%</td>
+ <td>8.8–9.4</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>72.0%</td>
+ <td>71.8–72.4</td>
+ <td>63.9%</td>
+ <td>63.8–64.0</td>
+ <td>53.0%</td>
+ <td>52.7–53.3</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-3</a>
+</div>
+ </figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by year of publication?</h4>
+ <p id="p-54"><a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Figure 2</a> presents the number (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2A</a>) and proportion (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2B</a>) of papers by access category and publication date. Articles published in the last 20 years are increasingly OA, and this trend shows no sign of slowing. More recent articles are more likely to be OA, with the most recent year examined also containing the most OA: 44.7% of 2015 articles are OA (95% CI [43.3–46.2%]), including 17.6% Bronze (95% CI [16.2–19.1]), 9.4% Hybrid (95% CI [8.0–10.9]), 11.3% Gold (95% CI [9.9–12.8]), and 6.3% Green (95% CI [4.9–7.8]). Well over one million OA papers were published in 2015. This growth trend has largely been driven by dramatic growth in Gold and Hybrid OA since the year 2000. However, more than 20% of papers published before the digital age are also freely available. The majority of these older OA papers are Bronze, and based on their age they are probably more precisely Delayed OA, although additional investigation will be required to confirm this. Bronze OA remains remarkably constant as a proportion of the literature for all publication years examined.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-2"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 2: Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-small.jpg 355w" data-image-id="fig-2" alt="Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="216"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 2: </span>Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" class="btn btn-mini" download="peerj-4375-fig-2.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-2</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-55">The number and proportion of Green papers must be interpreted with particular caution, due to several factors. First, unlike publisher-hosted OA (Gold, Bronze, and Hybrid), the date when the Green article <i>became open</i> is generally different from the date the article was <i>first published</i>. Authors often self-archive articles years after (or before, in the case of preprints) their original publication, leading to so-called “backfilling†of Green stocks (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>). Consequently, the graph cannot show the growth of Green OA over time; this would require longitudinal analysis over several years, and so is outside the scope of this analysis. Instead it shows the number and proportion of Green OA by publication year of the article. Second, many articles cannot be legally self-archived until a certain number of months after publication; this embargoing likely influences the apparent plateau in Green shown in <a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2</a>. Finally, as noted earlier, many self-archived articles would otherwise be Green except for being “shadowed†by a Gold, Bronze, or Hybrid of the same article elsewhere. For more detail on the growth of shadowed Green OA, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Figs. SA2</a> and <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">SA3</a>.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by publisher?</h4>
+ <p id="p-56">We analyzed a subset of the Crossref-DOIs sample by publisher (as listed on the Crossref metadata record) to understand how the extent and types of OA are common across publishers for recent publications (between 2009 and 2015). As we can see in <a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3A</a>, the largest publishers by volume publish the most OA articles by volume, led by Elsevier. As a proportion of all articles published (<a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3B</a>), however, PLOS and Hindawi distinguish themselves as being the only publishers in the top 20 with 100% OA. More than half of the papers published by Oxford University Press, Nature Publishing Group, IOP Publishing, and the American Physical Society (APS) are freely available online. In the case of APS this is largely driven by content available through repositories such as arXiv (for more details on repositories, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>).</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-3"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 3: Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-small.jpg 355w" data-image-id="fig-3" alt="Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="282"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 3: </span>Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" class="btn btn-mini" download="peerj-4375-fig-3.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-3</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary across disciplines?</h4>
+ <p id="p-57">We used the WoS-DOIs sample to examine OA prevalence differences by discipline, because of the easy availability of discipline metadata in the WoS index. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> displays our results. More than half of the publications are freely available in biomedical research and mathematics, while in chemistry and engineering &amp; technology less than 20% of the papers are freely available. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> also highlights the popularity of Green OA in disciplines like physics and mathematics, where more than one fifth of papers are available only through online repositories (mainly arXiv). Hybrid articles are particularly prevalent in mathematics (9.4%), biomedical research (8.1%) and clinical medicine (6.3%), while authors in biomedical research (15.3%), health (11.7%), mathematics (11.2%) and clinical medicine (10.3%) often publish in Gold journals.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-4"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 4: Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-small.jpg 355w" data-image-id="fig-4" alt="Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="241"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 4: </span>Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" class="btn btn-mini" download="peerj-4375-fig-4.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-4" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-4</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-58">Large variations can also be observed on the more detailed level of NSF specialties (<a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA5</a>). At more than 80% of OA articles, astronomy &amp; astrophysics (87%), fertility (86%), tropical medicine (84%), and embryology (83%) were the specialties where access to literature was the most open. At the other end of the spectrum are pharmacy (7%), inorganic &amp; nuclear chemistry (7%), and chemical engineering (9%), where publications were hidden behind a paywall for more than 90% of papers. More detail on these and other NSF specialties can be seen in <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>.</p>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">RQ2. What is the scholarly impact of open access?</h3>
+ <p id="p-59">Comparing the average relative citation impact of different access categories, the OACA is corroborated: Papers hidden behind a paywall were cited 10% below world average (ARC = 0.90), while those that are freely available obtain, on average, 18% more citations than what is expected (ARC = 1.18). However, citation impact differs between the different manners in which papers are made available for free: those that are only available as Green OA (ARC = 1.33) and Hybrid OA papers (ARC = 1.31) are cited the most with an impact of more than 30% above expectations, those available as Bronze are cited 22% above world average, while papers published as Gold OA obtain an ARC of 0.83. This constitutes an average relative citation impact of 17% below world average and 9% below that of articles hidden behind a paywall. <a class="xref xref-fig" href="#fig-5" data-jats-ref-type="fig" data-jats-rid="fig-5">Figure 5</a> below describes these findings.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-5"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 5: Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w" data-image-id="fig-5" alt="Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="388"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 5: </span>Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" class="btn btn-mini" download="peerj-4375-fig-5.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-5" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-5</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-60">These trends vary over time, however, as shown in <a class="xref xref-fig" href="#fig-6" data-jats-ref-type="fig" data-jats-rid="fig-6">Fig. 6</a>. While the ARC of closed access papers remains below world average throughout the period studied, it increased from .86 in 2009 to .93 over in 2014 and 2015. Meanwhile, when looking across all open types, the mean citation rate is consistently above the world average, fluctuating between 1.15 and 1.22. This fluctuation is guided by differences between the access types, with the impact of Hybrid OA papers increasing over the time period. While Green OA papers’ mean citation rate remain relatively stable, the highest impact, for 2015, is obtained by Bronze and Hybrid. The only form of open for which mean impact has decreased steadily over time is Gold. The results for more recent years are only based on a short citation window, however, and results might change over the next years as citations accumulate.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-6"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 6: Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-small.jpg 355w" data-image-id="fig-6" alt="Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="465"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 6: </span>Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" class="btn btn-mini" download="peerj-4375-fig-6.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-6" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-6</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Discussion and Conclusion</h2>
+ <p id="p-61">Access to scholarly literature is at the heart of current debates in the research community. Research funders are increasingly mandating OA dissemination to their grantees while, at the same time, the growth in toll-access subscription costs has prompted more and more university libraries to cancel subscriptions. In this context, several tools have been developed to provide access–both legally and illegally–to scholarly literature. Using data from one of these tools (oaDOI), this paper addresses two broad research questions: what percent of the literature is OA and how does it vary by type of OA, and what is the mean scholarly impact of papers diffused through this form. Three large samples were used to assess different aspects of OA patterns: (1) 100,000 articles that have a Crossref DOI, which allows us to assess the relative proportion of OA across all existing literature; (2) 100,000 WoS-indexed journal articles that have a DOI, which allows us to assess the scholarly impact of OA and non-OA papers; (3) 100,000 articles accessed by users through the Unpaywall browser extension, which lets us assess the proportion of OA papers found by users of this free tool.</p>
+ <p id="p-62">We found that 28% of all journal articles are freely available online (Crossref-DOI sample). Encouragingly for proponents of OA, this proportion has been growing steadily over the last 20 years, driven particularly by growth in Gold and Hybrid. Articles from 2015, the most recent year examined, had the highest proportion of OA (45%), as well as the largest absolute number of OA articles published in a single year. This disproportionate level of OA in recent years, combined with readers’ preference for more recent articles, leads to a felicitous situation for readers: the proportion of OA they <i>experience</i> as they browse and search is better than the overall percentage of OA across the literature as a whole. Users of the Unpaywall browser extension, which gives individual readers access to the oaDOI service, encounter OA articles nearly half (47%) of the time. The effect almost certainly extends beyond Unpaywall users; one may assume readers in general also favor newer articles, and therefore benefit from the growth of Gold, Bronze, and Hybrid OA among recent papers, even without using Unpaywall. More studies of readership data from other sources would be useful to quantify this further.</p>
+ <p id="p-63">Interestingly, we found that the majority of OA articles are Bronze–hosted on publisher websites, either without a license at all or without an open license. This is surprisingly high given that Bronze is relatively little-discussed in the OA literature, and suggests that this OA category deserves further attention from the OA community. In particular, Bronze OA may be significant in a policy context, since, unlike other publisher-hosted OA, Bronze articles do not extend any reuse rights beyond reading, making them Gratis OA. Much more research is needed into the characteristics of Bronze OA. How many Bronze articles are licensed openly, but do not make their license available? Is Bronze disproportionately non-peer-reviewed content? How much of Bronze OA is also Delayed OA? How much Bronze is Promotional, and how transient is the free-to-read status of this content? How many Bronze articles are published in “hidden gold” journals that are not listed in the DOAJ? Why are these journals not defining an explicit license for their content, and are there effective ways to encourage this? These and other questions are outside the scope of this study but may provide fruitful insights for future OA research and policy.</p>
+ <p id="p-64">Only about 7% of the literature overall (and 17% of the OA literature) is Green. This may at first seem disappointing, given years of advocacy focused on Green OA as well as ongoing growth in the number of Green OA mandates (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014" title="Anatomy of green open access" data-jats-ref-type="bibr" data-jats-rid="ref-11">Björk et al., 2014</a>). However, the full context of Green OA provides reasons for optimism. First, many papers are archived in repositories but are not counted as Green in this analysis because they are also available on the publisher site as Hybrid, Gold, or Bronze versions. These “shadowed Green” copies provide a useful safety net that preserves access in cases where publishers rescind it (as could potentially happen with Delayed OA and other Bronze articles). Further research is needed to determine the prevalence of shadowed Green OA in various disciplines. Second, the phenomenon of “backfilling” (authors self-archiving content published across all years, not just the current one) means that although the percentage graph of Green OA does not show the same year-over-year slope as Gold or Hybrid, the line itself may be rising across <i>all</i> years as authors gradually self-archive papers from years or even decades ago. This assumption is supported by results reported by <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. Finally, the relatively low proportion of Green OA encouragingly leaves room for continued growth. While most journals published by major publishers (Elsevier, Wiley, Springer, etc.) allow for self-archiving, research shows that only a small proportion of papers from these publishers actually are self-archived in OA repositories; for example, <a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=" title="Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature" data-jats-ref-type="bibr" data-jats-rid="ref-42">Smith et al. (in press)</a> report, using a sample of Global Health Research papers, that only 39% of them made use of available self-archiving rights.</p>
+ <p id="p-65">Our results confirm the Open Access Citation Advantage found by other studies: open articles receive 18% more citations than otherwise expected. While at least some of this boost is likely due to the fact that more access allows more people to read and hence cite articles they otherwise would not, causation is difficult to establish and there are many possible confounders. Most discussed is the so-called “selection bias postulate” (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2007.04.001" title="Do open access articles have greater citation impact?" data-jats-ref-type="bibr" data-jats-rid="ref-18">Craig et al., 2007</a>), which suggests that authors choose only their most impactful work to make OA. The current study does not examine the cause or directionality of correlation, but does find that it exists in a very large sample that is relatively representative of the literature as a whole. Funder requirements may also play a role in the observed citation advantage: high-profile funders are more likely to have an OA publishing requirement; at the same time, well-funded studies are independently more likely to receive more citations than poorly funded studies (<a class="xref xref-bibr" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/" title="Measuring the scientific output and impact of NIGMS grants" data-jats-ref-type="bibr" data-jats-rid="ref-7">Berg, 2010</a>). Interestingly, Gold articles are actually cited <i>less</i>, likely due to an increase in the number of newer and smaller OA journals. Some of these journals are from regions of the world not historically indexed by WoS, are published in languages other than English, or might be considered to be less prestigious because they have not had time to become established or accumulate citations (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>). On the flip side, the citation disadvantage of Gold OA is likely also affected by the continued growth of so-called ‘mega journals’ such as PLOS ONE (<a class="xref xref-bibr" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication" title="Reviewer guidelines: criteria for publication" data-jats-ref-type="bibr" data-jats-rid="ref-40"> PLOS, 2018</a>). Whatever the reason, the lower impact of Gold means the overall citation advantage is strongly driven by Green, Hybrid, and Bronze content. In sum, while several factors can affect the observed differences in citation rates, and causation remains difficult to establish, the fact remains that scholars are much more likely to read and cite papers to which they have access than those that they cannot obtain. Hopefully the existence of a free, open index of OA content will help support further research into the OACA question.</p>
+ <p id="p-66">The relatively high percentage of OA found in this study, particularly among readers of the free Unpaywall extension, has important potential implications for academic libraries. Increasingly, these libraries are under pressure to meet growing prices of “Big Deal” subscription packages, and the once-unthinkable outcome of canceling these Big Deals is becoming an increasingly realistic option. In this environment, knowing that around half of the literature of interest is available without any subscription may tip the scales toward cancellation for some institutions–particularly given that this percentage seems to be growing steadily. Indeed, the Université de Montréal’s cancellation of their Taylor &amp; Francis subscription package (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>) is particularly interesting, given that their cancellation announcement directly pointed faculty to Unpaywall and other tools to help them access OA content. This may seem a radical suggestion, but cancellation of subscription journals has long been part of the universal OA roadmap (<a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/" title="The forbidden forecast: thinking about open access and library subscriptions" data-jats-ref-type="bibr" data-jats-rid="ref-2">Anderson, 2017b</a>). Even when the percentage of OA is not enough to support outright cancellation, it may be enough to negotiate better subscription rates by supporting calculation of “OA-adjusted Cost Per Access” (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>). However, much more study is needed to see how OA availability varies across journals and Big Deal packages, along with praxis-oriented work building OA analysis tools that help librarians make cancellation choices.</p>
+ <p id="p-67">This study has several important limitations. Our dataset only includes journal articles with DOIs, which means that disciplines and geographical areas which rely more heavily on conference papers or articles without DOIs are underrepresented. Our Crossref sample includes about 7% journal “front matter” to which the journal has assigned a DOI and which Crossref labelled “journal article,” but which is actually a page describing the journal Editorial Board or similar. Our Bronze OA category includes articles published in OA journals which aren’t indexed in DOAJ; future work must identify these OA journals and classify such articles as Gold. As discussed in our definition of OA, when finding open copies we ignored free-to-read articles from academic social networks like ResearchGate and Academia.edu. The oaDOI system has some coverage of articles published on personal web pages, but this is quite limited compared to web-scale indexes like Google. The oaDOI system includes thousands of institutional and subject repositories, but there are some repositories that it misses. Our accuracy checks suggest that oaDOI, and therefore this study, are probably overlooking around 23% of OA otherwise discoverable using web searches, meaning that estimates reported in this paper undercount OA by approximately 30%. Finally, our approach did not detect <i>when</i> articles were deposited into repositories. Because repositories are often backfilled with content that was published many years earlier, this study does not measure any increase/decrease in prevalence of Green OA over time, but only the proportion of Green OA by article publication date at the moment of data collection.</p>
+ <p id="p-68">In addition to the empirical results obtained, this paper clearly shows the potential of the oaDOI service for future research. The freely available oaDOI service provides scholars with the basis for assessing and monitoring the development of access to scholarly literature on a large scale, as well as the factors that affect it. For instance, our results show that the percentage of the literature available as OA is growing, and that articles diffused through this form are generally more cited than closed access articles. Several factors are likely to contribute to these trends; however, these remain poorly understood. Combined with other datasets–such as the WoS, Scopus, or Crossref–oaDOI allows one to assess, at a large scale, the effects of various mandates on deposit rates, or to track the development of documents’ accessibility to determine, for example, when authors self-archive, or the sustainability of the Promotional OA category. Aggregated at the level of journals and publishing platforms, these data can also provide librarians with indicators to help inform subscription cancellations and mitigate their effects. The application of the oaDOI algorithm on a large scale also allows for more complete analysis of the OA citation advantage across fields and time. As in <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0013636" title="Self-selected or mandated, open access increases citation impact for higher quality research" data-jats-ref-type="bibr" data-jats-rid="ref-23">Gargouri et al. (2010)</a>, confounding factors could be mitigated by using article-level metadata to identify article pairs published in the same journal issue, on the same topic, or published by the same authors at the same time. We hope that other scholars will dig deeper into these data to better understand OA dissemination and the factors that drive it. This is of utmost importance for the future of scholarly communication.</p>
+ </section>
+ <section class="sec" id="supplemental-information">
+ <h2 class="heading"> Supplemental Information</h2>
+ <div class="supplementary-material well well-small" id="supp-1" data-jats-mimetype="application" data-jats-mime-subtype="vnd.openxmlformats-officedocument.wordprocessingml.document">
+<h3 class="heading">Additional results</h3>
+
+ <div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/supp-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/supp-1</a>
+</div>
+<div><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/appendix.docx" class="btn article-supporting-download" data-rel="supplement" download="appendix.docx" data-filename="appendix.docx"><i class="icon-large icon-download-alt"> </i> Download</a></div>
+</div>
+ </section>
+ </div>
+<div id="article-footnotes">
+<div class="fn article-footnote" id="fn-1"><span class="p">In the interest of full disclosure, it should be noted that two of the authors of the paper are the co-founders of Impactstory, the non-profit organization that developed oaDOI.</span></div>
+<div class="fn article-footnote" id="fn-2"><span class="p">Repositories that were included are those covered by the Bielefeld Academic Search Engine (BASE) in May 2017. A full listing of repositories can be found on their website at: <a class="ext-link" href="https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1</a>
+ </span></div>
+<div class="fn article-footnote" id="fn-3"><span class="p">DOIs are short, unique identifiers for scholarly papers. Crossref is a nonprofit that helps a the DOI system, and is by far the largest supplier of academic DOIs in academia.</span></div>
+<div class="fn article-footnote" id="fn-4"><span class="p">Based on a Sci-Hub dataset released in 2016 (the most recent data available).</span></div>
+<div class="fn article-footnote" id="fn-5"><span class="p">These journals were identified by selecting journals with over a one thousand articles per year from those classified in the general “biomedical research†category. The full list of journals meeting these criteria were: PLOS ONE, Nature, Science, Scientific Reports, PNAS, Nature Communication, PeerJ, and Science Advances.</span></div>
+<div class="fn article-footnote" id="fn-6"><span class="p">Ties between frequently cited specialties were resolved randomly; that is, if a paper cites exactly the same amount of papers from two NSF specialties, it was assigned to one of the two at random</span></div>
+<div class="fn article-footnote" id="fn-7"><span class="p">Citations were normalized using the population of WoS articles and reviews with a DOI.</span></div>
+</div></main><footer class="back">
+ <section class="ack" id="acknowledgements"><h2 class="heading">Acknowledgements</h2>
+ <p>The authors would like to thank Dorothea Salo, Kristin Antelman, and John Sack for extensive and valuable comments on a draft of this article. The author order of JP and HP was determined by coin flip, as is their custom.</p>
+ </section>
+ <div class="sec" id="additional-information">
+ <h2 class="heading">Additional Information and Declarations</h2>
+ <div class="fn-group" data-jats-content-type="competing-interests">
+ <h3 class="heading">Competing Interests</h3>
+<div class="fn" id="conflict-1" data-jats-fn-type="conflict"><p>Heather Piwowar and Jason Priem are founders of Impactstory, a non-profit company which makes Unpaywall, oaDOI, and other tools to improve scholarly communication.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="author-contributions">
+ <h3 class="heading">Author Contributions</h3>
+<div class="fn" id="contribution-1" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-1" data-jats-ref-type="contrib" data-jats-rid="author-1">Heather Piwowar</a>, <a class="xref xref-contrib" href="#author-2" data-jats-ref-type="contrib" data-jats-rid="author-2">Jason Priem</a> and <a class="xref xref-contrib" href="#author-9" data-jats-ref-type="contrib" data-jats-rid="author-9">Stefanie Haustein</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-2" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-3" data-jats-ref-type="contrib" data-jats-rid="author-3">Vincent Larivière</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-3" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-4" data-jats-ref-type="contrib" data-jats-rid="author-4">Juan Pablo Alperin</a> conceived and designed the experiments, performed the experiments, analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-4" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-5" data-jats-ref-type="contrib" data-jats-rid="author-5">Lisa Matthias</a> performed the experiments, analyzed the data, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-5" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-6" data-jats-ref-type="contrib" data-jats-rid="author-6">Bree Norlander</a> analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-6" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-7" data-jats-ref-type="contrib" data-jats-rid="author-7">Ashley Farley</a> wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-7" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-8" data-jats-ref-type="contrib" data-jats-rid="author-8">Jevin West</a> reviewed drafts of the paper.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="other">
+ <h3 class="heading">Data Availability</h3>
+<div class="fn" id="addinfo-1">
+<p>The following information was supplied regarding data availability:</p>
+ <p>Zenodo: <a class="ext-link" href="http://doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://doi.org/10.5281/zenodo.837902</a>.</p>
+ <p>The datasets behind the analysis in this paper are openly available at <a class="ext-link" href="http://dx.doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://dx.doi.org/10.5281/zenodo.837902</a> and the R statistics code can be found at <a class="ext-link" href="https://github.com/Impactstory/oadoi-paper1" data-jats-ext-link-type="uri">https://github.com/Impactstory/oadoi-paper1</a>. The oaDOI code is open source at <a class="ext-link" href="https://github.com/impactstory/oadoi" data-jats-ext-link-type="uri">https://github.com/impactstory/oadoi</a> and information about accessing the oaDOI API and full dataset is at <a class="ext-link" href="https://oadoi.org/api" data-jats-ext-link-type="uri">https://oadoi.org/api</a>.</p>
+</div>
+</div>
+ <h3 class="heading">Funding</h3>
+<p>The authors received no funding for this work.</p>
+</div>
+ <section class="ref-list-container" id="references"><h2 class="heading">References</h2>
+<ul class="ref-list" data-jats-content-type="authoryear">
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-1">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017a</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">When the wolf finally arrives: big deal cancelations in North American Libraries</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2018-01-09">09 January 2018</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-2">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017b</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">The forbidden forecast: thinking about open access and library subscriptions</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-07-15">15 July 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-3">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Antelman</span> <span class="given-names" itemprop="givenName">K</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf">Leveraging the growth of open access in library collection decision making</a>.</cite> In: <span itemprop="name"><a class="conf-name" target="_blank" href="https://scholar.google.com/scholar_lookup?title=Proceeding%20from%20ACRL%202017:%20at%20the%20helm:%20leading%20transformation&amp;author=&amp;publication_year=2017">Proceeding from ACRL 2017: at the helm: leading transformation</a>.</span><span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-4">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <span class="article-title"> <span class="source">Proportion of open access peer-reviewed papers at the European and world levels–2004–2011</span>. </span><span class="institution">European Commission, Brussels</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-5">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">AF</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <span class="article-title"> <span class="source">Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013</span>. </span><span class="institution">European Commission</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-6">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Côté</span> <span class="given-names" itemprop="givenName">G</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Struck</span> <span class="given-names" itemprop="givenName">B</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Voorons</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom">Research impact of paywalled versus open access papers</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-7">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Berg</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">Measuring the scientific output and impact of NIGMS grants</a>.</cite> <span> <span class="comment">NIGMS Feedback Loop Blog [Blog post]. <a class="uri" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/</a>
+ </span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-8">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2016a</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2016.08.002">Hybrid open access—a longitudinal study</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">919</span>-<span class="lpage" itemprop="pageEnd">932</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-9">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B-C</span></span>.</b> <b class="year" itemprop="datePublished">2016b</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1021">The open access movement at a crossroad: are the big publishers and academic social media taking over?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">29</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">131</span>-<span class="lpage" itemprop="pageEnd">134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-10">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1096">Gold, green, and black open access</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">30</b></span>:<span class="fpage" itemprop="pageStart">173</span>-<span class="lpage" itemprop="pageEnd">175</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-11">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paetau</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014">Anatomy of green open access</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Association for Information Science and Technology</span></span> <b itemprop="volumeNumber">65</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">237</span>-<span class="lpage" itemprop="pageEnd">250</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-12">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Majlender</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Guðnason</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0011273">Open access to the scientific journal literature: situation 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e11273</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-13">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bohannon</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1126%2Fscience.352.6285.508">Who’s downloading pirated papers? Everyone</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science</span></span> <b itemprop="volumeNumber">352</b></span>(<span itemprop="issueNumber">6285</span>)</span>:<span class="fpage" itemprop="pageStart">508</span>-<span class="lpage" itemprop="pageEnd">512</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-14">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Boudry</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chartron</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-016-2225-6">Availability of digital object identifiers in publications archived by PubMed</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics March</span></span> <b itemprop="volumeNumber">110</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">1453</span>-<span class="lpage" itemprop="pageEnd">1469</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-15">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chawla</span> <span class="given-names" itemprop="givenName">D</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement">Publishers take ResearchGate to court, alleging massive copyright infringement</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science News</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-16">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F19322909.2013.795426">Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Web Librarianship</span></span> <b itemprop="volumeNumber">7</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">243</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-17">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Olijhoek</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2016.1182672">Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">42</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">108</span>-<span class="lpage" itemprop="pageEnd">115</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-18">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Craig</span> <span class="given-names" itemprop="givenName">ID</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Plume</span> <span class="given-names" itemprop="givenName">AM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McVeigh</span> <span class="given-names" itemprop="givenName">ME</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Pringle</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amin</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2007</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2007.04.001">Do open access articles have greater citation impact?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">1</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">239</span>-<span class="lpage" itemprop="pageEnd">248</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-19">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Creative Commons</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://creativecommons.org/licenses/by/4.0/">Attribution 4.0 International (CC BY 4.0)</a></cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-20">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1096%2Ffj.11-183988">Open access, readership, citations: a randomized controlled trial of scientific journal publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">FASEB Journal</span></span> <b itemprop="volumeNumber">25</b></span>:<span class="fpage" itemprop="pageStart">2129</span>-<span class="lpage" itemprop="pageEnd">2134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-21">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walters</span> <span class="given-names" itemprop="givenName">WH</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.3163%2F1536-5050.99.3.008">The impact of free access to the scientific literature: a review of recent research</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">99</b></span>:<span class="fpage" itemprop="pageStart">208</span>-<span class="lpage" itemprop="pageEnd">217</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-22">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fortney</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gonder</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <span class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html">A social networking site is not an open access repository</a>. <span class="source">Office of Scholarly Communication</span>. </span><span class="institution">University of California</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-23">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0013636">Self-selected or mandated, open access increases citation impact for higher quality research</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">10</span>)</span>:<span class="fpage" itemprop="pageStart">e13636</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-24">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/1206.3664">Green and gold open access percentages and growth, by discipline</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-25">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gorraiz</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Melero-Fuentes</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gumpenbergera</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Valderrama-Zuriánc</span> <span class="given-names" itemprop="givenName">J-C</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2015.11.008">Availability of digital object identifiers (DOIs) in web of science and scopus</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">98</span>-<span class="lpage" itemprop="pageEnd">109</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-26">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greshake</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.11366.1">Looking into Pandora’s Box: the content of <i>Sci-Hub</i> and its usage [version 1; referees: 2 approved, 2 approved with reservations]</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000Research</span></span> <b itemprop="volumeNumber">6</b></span> <span class="comment">Article 541</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-27">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>.</b> <b class="year" itemprop="datePublished">2006</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/cs/0606079">Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-28">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Vallières</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hitchcock</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Oppenheim</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hilf</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2008.10765150">The access/impact problem and the green and gold roads to open access: an update</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">34</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">36</span>-<span class="lpage" itemprop="pageEnd">40</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-29">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Himmelstein</span> <span class="given-names" itemprop="givenName">DS</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Romero</span> <span class="given-names" itemprop="givenName">AR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McLaughlin</span> <span class="given-names" itemprop="givenName">SR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tzovaras</span> <span class="given-names" itemprop="givenName">BG</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greene</span> <span class="given-names" itemprop="givenName">CS</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1">Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)</a></cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ Preprints</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-30">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jamali</span> <span class="given-names" itemprop="givenName">HR</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-017-2291-4">Copyright compliance and infringement in ResearchGate full-text journal articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">112</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">241</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-31">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1186%2F1741-7015-10-124">Anatomy of open access publishing: a study of longitudinal development and internal structure</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Medicine</span></span> <b itemprop="volumeNumber">10</b></span> <span class="comment">Article 124</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-32">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fasi.22856">Delayed open access: an overlooked high-impact category of openly available scientific literature</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the American Society for Information Science and Technology</span></span> <b itemprop="volumeNumber">64</b></span>(<span itemprop="issueNumber">7</span>)</span>:<span class="fpage" itemprop="pageStart">1323</span>-<span class="lpage" itemprop="pageEnd">1329</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-33">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bukvova</span> <span class="given-names" itemprop="givenName">H</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nyman</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0020961">The development of open access journal publishing from 1993 to 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">6</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e20961</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-34">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Matsubayashi</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kurata</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Sakai Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mine</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ueda</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009">Status of open access in the biomedical field in 2005</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">97</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">4</span>-<span class="lpage" itemprop="pageEnd">11</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-35">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McCabe</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Snyder</span> <span class="given-names" itemprop="givenName">C</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1111%2Fecin.12064">Identifying the effect of open access on citations using a panel of science journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Economic Inquiry</span></span> <b itemprop="volumeNumber">52</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">1284</span>-<span class="lpage" itemprop="pageEnd">1300</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-36">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McKiernan</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bourne</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brown</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Buck</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kenall</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Lin</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McDougall</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nosek</span> <span class="given-names" itemprop="givenName">BA</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ram</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soderberg</span> <span class="given-names" itemprop="givenName">CK</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName"> Spies</span> <span class="given-names" itemprop="givenName"> JR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Updegrove</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Woo</span> <span class="given-names" itemprop="givenName">KH</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Yarkoni</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rodgers</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7554%2FeLife.16800">How open science helps researchers succeed</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" 
itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">eLife</span></span> <b itemprop="volumeNumber">5</b></span>:<span class="elocation-id" itemprop="pageStart">e16800</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-37">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paul-Hus</span> <span class="given-names" itemprop="givenName">A</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-015-1765-5">The journal coverage of Web of Science and Scopus: a comparative analysis</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">106</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">213</span>-<span class="lpage" itemprop="pageEnd">228</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-38">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ottaviani</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0159614">The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">11</b></span>(<span itemprop="issueNumber">8</span>)</span>:<span class="fpage" itemprop="pageStart">e0159614</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-39">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Packer</span> <span class="given-names" itemprop="givenName">AL</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010">The SciELO open access: a gold way from the south</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Canadian Journal of Higher Education</span></span> <b itemprop="volumeNumber">39</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">111</span>-<span class="lpage" itemprop="pageEnd">126</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-40">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">PLOS</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication">Reviewer guidelines: criteria for publication</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-41">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Schiermeier</span> <span class="given-names" itemprop="givenName">Q</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mega</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1038%2Fnature.2016.21223">Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Nature News</span></span> <b itemprop="volumeNumber">541</b></span>(<span itemprop="issueNumber">7635</span>)</span>:<span class="fpage" itemprop="pageStart">13</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-42">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Smith</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Haustein</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fei</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ridde</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>.</b></span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=">Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Health Research Policy and System</span></span> <span class="comment">In Press</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-43">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">SPARC Europe</span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/">The open access citation advantage: list of studies until 2015</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-44">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Suber</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://dash.harvard.edu/handle/1/4322580">Gratis and libre open access</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">SPARC Open Access Newsletter, 124</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-45">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D">The open access citation advantage</a>.</cite> <span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-08-02">2 August 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-46">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">JP</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Waldner</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jacques</span> <span class="given-names" itemprop="givenName">DC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Masuzzo</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Collister</span> <span class="given-names" itemprop="givenName">LB</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hartgerink</span> <span class="given-names" itemprop="givenName">CH</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.8460.3">The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000 Research</span></span> <b itemprop="volumeNumber">5</b></span> <span class="comment">Article 632</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-47">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Universitat Konstanz</span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/">Teurer als die Wissenschaft erlaubt</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-48">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Université de Montréal</span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm">UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-49">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Wagner</span> <span class="given-names" itemprop="givenName">AB</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.5062%2FF4Q81B0W">Open access citation advantage: an annotated bibliography</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Issues in Science and Technology Librarianship</span></span> <b itemprop="volumeNumber">60</b></span>:<span class="fpage" itemprop="pageStart">2</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-50">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walker</span> <span class="given-names" itemprop="givenName">TJ</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soichi</span> <span class="given-names" itemprop="givenName">transl. T</span></span>.</b> <b class="year" itemprop="datePublished">1998</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1241%2Fjohokanri.41.678">Free internet access to traditional journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Information Processing and Management</span></span> <b itemprop="volumeNumber">41</b></span>(<span itemprop="issueNumber">9</span>)</span>:<span class="fpage" itemprop="pageStart">678</span>-<span class="lpage" itemprop="pageEnd">694</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-51">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2003</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003">The nine flavours of open access scholarly publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Postgraduate Medicine</span></span> <b itemprop="volumeNumber">49</b></span>:<span class="fpage" itemprop="pageStart">263</span>-<span class="lpage" itemprop="pageEnd">267</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Book" id="ref-52">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite class="article-title"></cite> <span itemprop="name"><a class="source" target="_blank" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=&amp;publication_year=2009">The access principle: the case for open access to research and scholarship</a></span><span> (<span class="edition">1 edition</span>). Cambridge: <span class="publisher">MIT Press</span>. </span>
+</div></li>
+</ul></section>
+ </footer></article>
+ </div>
+
+
+ <div id="related-research"></div>
+
+ <!-- annotations -->
+ <ul class="nav nav-tabs annotation-tabs-nav">
+ <li class="active"><a href="#questions" data-toggle="tab"><i class="icon-comments"></i> Questions
+ <span class="annotation-counter annotation-counter-questioning"></span></a></li>
+ <li><a href="#links" data-toggle="tab"><i class="icon-link"></i> Links
+ <span class="annotation-counter annotation-counter-linking"></span></a></li>
+ </ul>
+
+ <div class="tab-content annotation-tab-content">
+ <div class="tab-pane active" id="questions">
+ <div class="annotations" id="questions" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/questions/index.html?target=articles/4375&amp;_sort=score">Questions</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-question"
+ data-toggle="annotation-form"
+ data-target="#annotation-question-create-container"
+ rel="nofollow"
+ href="/questions.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Ask a question</a>
+ <div class="help-block annotation-learn-more"><a href="/about/FAQ/academic-contribution/" target="_blank">Learn more about Q&amp;A</a></div>
+ <div class="annotation-form-container"
+ id="annotation-question-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+
+ <div class="tab-pane" id="links">
+ <div class="annotations" id="links" data-target="articles/4375" data-counts="1">
+ <div class="row-fluid row-article-item-section">
+ <div class="span1 article-main-left-span1">&nbsp;</div>
+ <div class="span11 article-item-section-content">
+
+ <div>
+ <a rel="nofollow" class="annotation-loader"
+ href="/links/index.html?target=articles/4375&amp;_sort=score">Links</a>
+ </div>
+
+ <a class="btn btn-primary annotation-create-button add-annotation"
+ id="annotation-create-link"
+ data-toggle="annotation-form"
+ data-target="#annotation-link-create-container"
+ rel="nofollow"
+ href="/links.form?format=html&amp;target=articles/4375&amp;_counts=1"><i class="icon-plus"></i> Add a link</a>
+ <div class="annotation-form-container"
+ id="annotation-link-create-container"></div>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+
+ <div class="hidden-desktop" id="mobile-featured-jobs"></div>
+ </div>
+
+ <!-- Right sidebar -->
+ <div class="span3 offset1 article-sidebar visible-desktop">
+ <div id="article-sidebar-main-content" data-todo-href="/todos/19698/">
+ <div class="dimensions-stats-container">
+ <span class="__dimensions_badge_embed__" data-doi="10.7717/peerj.4375" data-hide-zero-citations="true" data-legend="always" data-style="small_circle"></span>
+ </div>
+
+
+ <div class="row-fluid item-action-buttons article-sidebar-item">
+ <div class="span12">
+ <a href="/benefits/" class="author-quote article-author-quote-link">
+ <div class="author-quote-text">
+ <span class="lead-in">I published in PeerJ</span> and it is very fast, has good editors, has consistently given good quality and rigorous reviews of my work, and produces visually appealing manuscripts.</div>
+ <div class="author-quote-details">
+ <span class="author-quote-name">Matthew Jackson</span><br>
+ PeerJ author
+ </div>
+</a> <div class="article-free-publishing-cta">
+ <div class="article-free-publishing-cta-title">Publish Free in 2020</div>
+ <div class="article-free-publishing-cta-subline">In PeerJ Chemistry Journals</div>
+ <a href="https://peerj.com/blog/post/115284881305/free-open-access-publishing-for-chemistry-and-computer-science-subject-areas" class="btn btn-article article-free-publishing-cta-btn">
+ Learn more
+ </a>
+ </div>
+ <div id="download-modal-trigger" class="js-download-modal-trigger btn btn-article btn-download btn-success mb-3 ">
+ Download
+</div> <!--<div class="content-cta-intro-text">Want alerts from articles like this?</div>-->
+<div id="content-alert-link" class="content-alert-link-btn" data-href="/content-alert/?aid=19698">
+ <div id="content-alert-button-label">
+ <i class="icon-envelope btn-content-alert-icon"></i>
+ Content <div class="content-alert-btn-lastword">Alert</div>
+ </div>
+ <div id="content-alert-button-loading" style="display:none;"><i class="icon-spin icon-spinner"></i> Loading...</div>
+</div>
+ <div class="content-cta-help-text">
+ Just enter your email
+ </div>
+ </div>
+ </div>
+
+
+
+
+ <nav class="article-sidebar-block">
+ <div class="sidebar-heading">
+ <i class="icon-wrench"></i> Tools & info
+ </div>
+ <ul class="nav nav-list article-item-metrics-counts" data-src="/articles/4375/counter/">
+ <li>
+ <a href="/articles/4375/reviews/"
+ rel="version-history">Peer Review history</a>
+ </li>
+
+
+ <li><a href="/articles/4375/citations/" data-toggle="modal" data-target="#citing-modal">See citing articles <span class="metric-counter citation-item-count">203</span></a></li>
+
+
+ <li><a href="#questions">Ask questions
+ <span class="metric-counter annotation-counter-questioning"></span></a></li>
+
+ <li><a href="#links">Add links
+ <span class="metric-counter annotation-counter-linking"></span></a></li>
+
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Visitors <span class="metric-counter" data-count="visitors">&nbsp;</span> <span class="pull-right metric-counter-details-cta">click for details</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Views <span class="metric-counter" data-count="views-html">&nbsp;</span></a></li>
+ <li class="article-item-metrics-count"><a data-toggle="modal" href="#metricsModal">Downloads <span class="metric-counter" data-count="views-pdf">&nbsp;</span></a></li>
+
+ <li><a id="item-flag-button" data-toggle="modal" href="#flagModal">Report problem with article</a></li>
+ </ul>
+ </nav>
+
+
+ <div id="related-research-sidebar"></div>
+
+</div>
+<nav class="article-sidebar-block follow" >
+ <div class="sidebar-heading">
+ <i class="icon-list-ul"></i> Outline
+ </div>
+ <div class="article-navigation"></div>
+ <div id="top-return" class="top-return">
+ <i class="icon-arrow-up"></i> Return to top
+ </div>
+
+ <div data-clone="#expertrxiv-related" data-source="/expertrxiv/related/?subjectIds=85%2C87%2C111&amp;subjects=Legal%20Issues%2C%20Science%20Policy%2C%20Data%20Science"></div>
+
+ </nav>
+
+<div class="subjects-navigation"></div>
+
+ <div id="article-identifiers">
+ <span class="article-meta-name">PubMed</span>
+ <a href="https://www.ncbi.nlm.nih.gov/pubmed/29456894"
+ id="article-identifier-pmid" target="_blank">29456894</a>
+ </div>
+ </div>
+ </div>
+
+
+<style>
+ .modal-loading-container{
+ display:flex;
+ justify-content:center;
+ color:#999;
+ padding:3rem;
+ }
+</style>
+
+<div id="download-article-modal" class="modal hide fade peer-review-article" style="">
+
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Download article</h3>
+ </div>
+
+ <div class="modal-body">
+ <div id="download-article-modal-loading" class="modal-loading-container" style="display:none;">
+ <i class="icon-spin icon-3x icon-spinner"></i>
+ </div>
+ <div id="download-article-modal-body">
+ <div id="download-modal-buttons-container">
+ <div class="download-modal-article-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</div>
+ <div class="mt-2 download-buttons">
+ <a target="_blank" download data-format="PDF" data-download-confirm-text="PDF downloaded" href="https://peerj.com/articles/4375.pdf" target="_blank" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> PDF (2.3MB)</a>
+ <a target="_blank" data-download-confirm-text="Mendeley opened" href="http://www.mendeley.com/import/?doi=10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Save to Mendeley</a>
+ <a target="_blank" data-download-confirm-text="Readcube article opened" href="http://www.readcube.com/articles/10.7717/peerj.4375" class="btn btn-primary js-download-btn btn-block btn-large mb-2"><i class="icon-cloud-download mr-1"></i> Read in ReadCube</a>
+ <a target="_blank" data-format="RIS" data-download-confirm-text="RIS file downloaded" href="https://peerj.com/articles/4375.ris" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> RIS</a>
+ <a target="_blank" data-format="XML" data-download-confirm-text="XML file downloaded" href="https://peerj.com/articles/4375.xml" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> XML</a>
+ <a target="_blank" data-format="BibText" data-download-confirm-text="BibText file downloaded" href="https://peerj.com/articles/4375.bib" class="btn btn-primary js-download-btn btn-block btn-large mb-2 "><i class="icon-cloud-download mr-1"></i> BibTeX</a>
+
+ </div>
+ </div>
+
+ <div id="download-modal-downloading-message" style="display:none;">
+ <div class="text-center pt-4 pb-4">
+ <div>
+ <strong>Your download will start in a moment...</strong>
+ </div>
+ <div class="btn btn-secondary mt-4 js-close-download-modal">Close</div>
+ </div>
+ </div>
+
+ <div id="download-modal-signup-container" style="display:none;">
+
+<div class="download-modal-cta-container">
+
+ <div class="download-modal-confirm">
+ <div class="download-modal-confirm-title">
+ <i class="icon-tickcircle downloaded-tick"></i> <span class="download-modal-confirm-title-text"></span>
+ <i class="icon-chevron-down show-download-link"></i>
+ </div>
+ <a class="article-modal-download-url" href=""></a>
+ </div>
+
+
+ <div class="download-modal-cta-subtitle-small mt-2 mb-4 text-center">
+ Subscribe for subject updates
+ </div>
+
+ <div class="section-subscribe-container mb-2" style="display: flex;justify-content:center;">
+ <div>
+ <input type="text" placeholder="Email address" name="email" value="" class="form-control" id="download-subscribe-email">
+ </div>
+ <div class="ml-1">
+ <select name="freq" class="form-control" style="width: 100%;" id="download-subscribe-freq">
+ <option value="daily">Daily</option>
+ <option value="weekly">Weekly</option>
+ </select>
+ </div>
+ </div>
+
+ <div id="download-subscribe-error-container" class="mb-2 text-center text-error" style="display:none;"></div>
+
+
+ <button class="btn btn-primary btn-block btn-large mb-2 btn-modal-cta"
+ style="display: block;"
+ id="download-subscribe-submit"
+ data-url="/content-alert/download-subscribe?aid=19698"
+ data-signed-in=""
+ data-section-name="">
+ Subscribe
+ </button>
+
+ <a href="#" class="btn btn-block btn-link btn-large btn-modal-close js-close-download-modal mb-2">
+ Close
+ </a>
+
+</div>
+
+<script>
+ (function(){
+ $('#download-subscribe-submit').click(function(){
+
+ var button = $(this);
+ var url = button.data('url');
+ if(button.attr('disabled')) return;
+
+ $.get(url, function(response){
+
+ // Declare the error container before the token check so the early-return
+ // branch below can safely call .html() on it (the original declaration came
+ // after the check, leaving it undefined in that branch).
+ var errorContainer = $('#download-subscribe-error-container');
+
+ if(!response.token){
+ errorContainer.html('Server error, you have not been subscribed').show();
+ button.html('Subscribe').removeAttr('disabled');
+ return;
+ }
+
+ errorContainer.html('').hide();
+ button.html('<i class="icon-spin icon-spinner"></i>').attr('disabled', true);
+
+ var signedIn = button.data('signed-in');
+ var sectionName = button.data('section-name');
+ var data = {
+ _token: response.token
+ };
+
+ if(!signedIn) {
+ var email = $('#download-subscribe-email').val();
+ data.email = email;
+ data.freq = $('#download-subscribe-freq').val();
+ }
+
+ $.ajax({
+ url: url,
+ method: 'POST',
+ data: data
+ }).success(function(response){
+ button.hide();
+ $('.js-close-download-modal').trigger('click');
+
+ PeerJ.Tools.ToastNotifications.add({
+ type: 'success',
+ title: 'Subscribed',
+ text: sectionName ? 'You subscribed to ' + sectionName : 'You subscribed to this article\'s subjects'
+ });
+
+ }).error(function(response){
+ if(response.responseJSON && response.responseJSON.errors){
+ errorContainer.html(response.responseJSON.errors[0]).show();
+ }
+ }).complete(function(){
+ button.html('Subscribe').removeAttr('disabled');
+ });
+
+ });
+ });
+
+ }());
+</script>
+ </div>
+ </div>
+ </div>
+
+ <div class="modal-footer" style="display:none;">
+ <div class="pull-right">
+ </div>
+
+ <span class="submit-copy submit-copy-btn btn cancel pull-left" id="modal-cancel" data-dismiss="modal">
+ Cancel
+ </span>
+ </div>
+</div>
+
+ <div id="ajax-form"></div>
+
+ <!-- Flag Modal -->
+ <div id="flagModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align: center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim">Report a problem</h3>
+ </div>
+
+ <form id="article-flag-form"
+ data-href="/issues/4375/flag/"
+ method="post">
+
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="alert alert-info">
+ <p><strong>Common use cases</strong><br>
+ Typos, corrections needed, missing information, abuse, etc
+ </p>
+
+ <p><strong>Our promise</strong><br>
+ PeerJ promises to address all issues as quickly and professionally as possible. We
+ thank you in advance for your patience and understanding.
+ </p>
+ </div>
+
+ <div id="flag-modal-result" style="margin-left:45px;">
+
+ <div>
+ <label><strong>Type of problem</strong></label>
+ <p>
+ <select id="moderation_flag_category" name="moderation_flag[category]" class="span4"><option value="typo">Typo</option><option value="metadata">Missing or incorrect metadata</option><option value="quality">Quality: PDF, figure, table, or data quality</option><option value="download">Download issues</option><option value="abuse">Abusive behavior</option><option value="misconduct">Research misconduct</option><option value="other">Other issue not listed above</option></select>
+
+ </p>
+ </div>
+ <div>
+ <label><strong>Details</strong> <i class="icon-large icon-question-sign" title="Please be as detailed as possible within the 500 character limit. Any details you provide will not be shown publicly." data-toggle="tooltip"></i></label>
+ <div>
+ <textarea id="moderation_flag_detail" name="moderation_flag[detail]" required="required" maxlength="500" class="span4" placeholder="Enter any details about this issue. Kept confidential with PeerJ staff." rows="5" data-counter-target="#flag-counter"></textarea>
+
+ <div style="margin:10px 0 0 0; color:#777777; float: left; display: block"><span id="flag-counter" class="label">500</span> characters remaining</div>
+ </div>
+ </div>
+
+ </div>
+
+ </div>
+ </form>
+ <div id="flag-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Cancel</button>
+ <input type="submit" class="btn btn-success save-flag-btn" value="Send report">
+ </div>
+</div>
+
+ <!-- Follow Publication Modal -->
+ <div id="followModal" class="modal hide" style="max-height:none">
+ <div class="modal-header" style="text-align:center">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3 class="slim" id="followModalLabel">Follow this publication for updates</h3>
+ </div>
+
+ <div>
+ <div class="modal-body" style="max-height:350px;overflow-y:auto">
+ <div class="row-fluid" style="margin-bottom: 15px">
+ <div class="span1">
+ <i class="icon-large icon-bullhorn"></i>
+ </div>
+ <div class="span11">
+ "Following" is like subscribing to any updates related to a publication.
+ These updates will appear in your home dashboard each time you visit PeerJ.
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span1">
+ <i class="icon-large icon-envelope"></i>
+ </div>
+ <div class="span11">
+ <p>
+ You can also choose to receive updates via daily or weekly email digests.
+ If you are following multiple publications then we will send you
+ no more than one email per day or week based on your preferences.
+ </p>
+ <p>
+ <em>Note: You are now also subscribed to the subject areas of this publication</em>
+ and will receive updates in the daily or weekly email digests if turned on.
+ You can <a href="/settings/details/">add specific subject areas</a> through your profile settings.
+ </p>
+ </div>
+ </div>
+
+ <hr>
+ <div id="follow-modal-result" style="margin-left:-40px;padding-top:7px;">
+ </div>
+
+ </div>
+
+ </div>
+
+ <div id="follow-modal-footer" class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+ </div>
+
+ <!-- Unfollow Publication Modal -->
+ <div id="unfollowModal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h3>Change notification settings or unfollow</h3>
+ </div>
+
+ <form id="article-unfollow-form"
+ data-href="/follow/publication/4375/1/"
+ method="put" class="form-horizontal">
+
+
+ <div id="unfollow-form-load-result" class="modal-body" data-href="/follow/publication/4375/edit/" style="max-height:350px;overflow-y:auto">
+ <p>Loading ...</p>
+ </div>
+
+ </form>
+ <div class="modal-footer">
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ <input type="submit" class="btn btn-success update-follow-btn" value="Update">
+ </div>
+</div>
+
+ <!-- Metrics Modal -->
+ <div id="metricsModal" class="modal hide">
+ <div class="modal-body" style="max-height:330px;overflow-y:auto">
+
+ <div class="row-fluid">
+ <div class="span12">
+ <p class="leadh2">Usage since published - updated daily</p>
+ </div>
+ </div>
+
+ <div class="row-fluid">
+ <div class="span8">
+ <h3 style="margin-bottom:10px">Social referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Twitter</div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Facebook</div>
+ <div class="span3" style="text-align:right;min-height:0">676</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">Reddit</div>
+ <div class="span3" style="text-align:right;min-height:0">15</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">LinkedIn</div>
+ <div class="span3" style="text-align:right;min-height:0">11</div>
+ </div>
+
+ <h3 style="margin:30px 0 10px 0">Top referrals <small>unique visitors</small></h3>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From bookmark or typed URL
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">30,876</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Google search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">5,439</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Twitter
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">1,515</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ From PeerJ Content Alert Emails
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">32</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Yahoo search
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">20</div>
+ </div>
+ <div class="row-fluid" style="font-size: 16px; color: #444; border-bottom: 1px solid #ccc; margin-bottom: 5px;">
+ <div class="span8" style="min-height:0">
+ Webmail
+ </div>
+ <div class="span3" style="text-align:right;min-height:0">3</div>
+ </div>
+ </div>
+
+ <div class="span4" style="overflow-x:hidden;">
+ <h3 style="margin-bottom:10px">Share this publication</h3>
+
+
+
+ <ul class="unstyled">
+ <li>
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+</ul>
+ <h3 style="margin-bottom:10px;margin-top:10px">Metrics</h3>
+
+ <!-- Altmetric -->
+ <div class="altmetric-embed" data-badge-popover="right"
+ data-link-target="_blank" data-doi="10.7717/peerj.4375"></div>
+ </div>
+ </div>
+
+ </div>
+
+ <div class="modal-footer">
+ <button class="btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Wiki Modal -->
+
+ <!-- Links Modal -->
+ <div class="modal hide fade" id="article-links-modal">
+ <div class="modal-header">
+ <a rel="nofollow" data-dismiss="modal" aria-hidden="true" class="close">&times;</a>
+
+ <h3 class="modal-title">Links</h3>
+ </div>
+
+ <div class="modal-body"></div>
+
+ <div class="modal-footer">
+ <a rel="nofollow" href="/links.form?target=articles/4375" class="btn btn-primary">Add a link</a>
+ <button class="btn follow-close-btn" data-dismiss="modal" aria-hidden="true">Close</button>
+ </div>
+</div>
+
+ <!-- Citing Modal -->
+ <div id="citing-modal" class="modal hide">
+ <div class="modal-header">
+ <button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
+ <h2 class="slim"><i class="icon-copy"></i> Articles citing this paper</h2>
+ </div>
+ <div class="modal-body">Loading citing articles… <i class="icon icon-spinner icon-spin"></i></div>
+</div>
+
+ <!-- Graphical abstract modal -->
+
+ </div>
+
+
+ <div id="push"></div>
+ </div>
+
+ <footer id="footer">
+ <div class="foot">
+ <div class="container">
+
+ <div class="row">
+ <div class="span7">
+ <b>About us -</b> <a href="/about/" class="aboutLink" data-target="team">PeerJ team</a>
+ | <a href="/about/publications/" class="aboutLink" data-target="journals">Our publications</a> |
+ <a href="/benefits/">Benefits</a> | <a
+ href="/about/partnerships/" class="aboutLink" data-target="partnership">Partnerships</a> | <a
+ href="/about/endorsements/" class="aboutLink" data-target="endorsements">Endorsements</a>
+ <i class="icon-trophy"></i> <a href="/about/reviews/" class="aboutLink" data-target="reviews">Awards</a>
+ </div>
+ <div class="span5">
+ <b>Resources -</b> <a href="/about/FAQ/">FAQ</a> | <a
+ href="/about/careers/">Careers</a> | <a href="/about/press/">Press
+ room</a> | <a href="/about/terms/">Terms of use</a> | <a
+ href="/about/privacy/">Privacy</a> | <a
+ href="/about/contact/" class="aboutLink" data-target="contact">Contact</a>
+ </div>
+ <div class="span7">
+ <b>Academic boards -</b> <a href="/academic-boards/advisors/">Advisors</a> | <a
+ href="/academic-boards/editors/">Editors</a> |
+ <a href="/academic-boards/subjects/">Subject areas</a>
+ </div>
+ <div class="span5">
+ <b>Follow us -</b>
+ <a href="https://peerj.com/blog/">PeerJ blog</a> |
+ <a href="http://twitter.com/thePeerJ/" title="Follow on Twitter" data-toggle="tooltip">Twitter</a>
+ |
+ <a href="http://facebook.com/thePeerJ/" title="Follow on Facebook" data-toggle="tooltip">Facebook</a>
+ |
+ <a href="http://www.linkedin.com/company/peerj" title="Follow on LinkedIn" data-toggle="tooltip">LinkedIn</a>
+ |
+ <a href="https://www.instagram.com/thepeerj" title="Follow on Instagram" data-toggle="tooltip">Instagram</a>
+ |
+ <a href="http://www.pinterest.com/thepeerj/boards/" title="Follow on Pinterest" data-toggle="tooltip">Pinterest</a>
+ </div>
+ <div class="span7">
+ <b>Submission guides -</b>
+ <a href="/about/aims-and-scope"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/about/aims-and-scope/cs"><em>PeerJ Computer Science</em></a> |
+ <a href="/about/aims-and-scope/chemistry"><em>PeerJ Chemistry</em></a>
+ </div>
+ <div class="span5">
+ <b>Spread the word</b> -
+ <a href="/spread-the-word/activities/">Activities</a> |
+ <a href="/spread-the-word/resources/">Resources</a>
+ </div>
+ <div class="span7">&nbsp;</div>
+ <div class="span5">
+ <b>PeerJ feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom" rel="alternate" title="Articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1">RSS 1.0</a> |
+ <a href="/articles/index.rss2">RSS 2.0</a> |
+ <a href="/articles/index.json">JSON</a>
+ <br>
+
+ <b>PeerJ Computer Science feeds <i class="icon-rss"></i> - </b>
+ <a href="/articles/index.atom?journal=cs" rel="alternate" title="PeerJ Computer Science articles (Atom)" type="application/atom+xml">Atom</a> |
+ <a href="/articles/index.rss1?journal=cs">RSS 1.0</a> |
+ <a href="/articles/index.rss2?journal=cs">RSS 2.0</a> |
+ <a href="/articles/index.json?journal=cs">JSON</a>
+ <br>
+ <b>Archives - </b>
+ <a href="/archives/" rel="archives"><em>PeerJ – Life and Environment</em></a> |
+ <a href="/archives/?journal=cs" rel="archives"><em>PeerJ Computer Science</em></a>
+ </div>
+
+</div>
+
+<div id="fb-root"></div>
+
+ <div class="row" style="margin-top:10px;font-size:12px">
+ <div class="span12" style="color:#888">
+
+ <div>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ</span> ISSN: 2167-8359</span>
+ <span style="margin-right:7px"><span style="font-style:italic">PeerJ Comput. Sci.</span> ISSN: 2376-5992</span>
+ <span><span style="font-style:italic">PeerJ Preprints</span> ISSN: 2167-9843</span>
+ </div>
+ </div>
+</div>
+ </div>
+ </div>
+ </footer>
+
+ <div id="alerts" data-async-alerts="/alerts/"></div>
+
+ <script src="/js/8d39319-35fca22.js"></script>
+ <script src="https://cdn.peerj.com/webpack/runtime.bfc7ab93.js"></script><script src="https://cdn.peerj.com/webpack/0.7880a6b6.js"></script><script src="https://cdn.peerj.com/webpack/1.24ea793f.js"></script><script src="https://cdn.peerj.com/webpack/vue-bundle.9bf24d69.js"></script>
+
+
+ <script src="/js/5d3c493-193ec0b.js"></script>
+
+ <script src="/js/c1dacd9-f146d62.js"></script>
+ <!--[if gt IE 8]><!-->
+ <script src="/assets/js/highlight/highlight.pack.js"></script>
+
+ <script>
+ $(function () {
+ // syntax highlighting for code blocks
+ $("pre > code").each(function() {
+ var node = $(this);
+
+ var language;
+
+ // JATS >=1.1
+ language = node.data('jats-language');
+
+ if (!language) {
+ // JATS <1.1
+ language = node.data('jats-preformat-type');
+
+ // ignore default 'code' type
+ if (language === 'code') {
+ language = null;
+ }
+ }
+
+ if (language) {
+ node.addClass('language-' + language);
+ }
+
+ hljs.highlightBlock(this);
+ });
+ });
+ </script>
+ <!--<![endif]-->
+
+ <script>
+ //initialise the follow button
+ $(function() {
+ PeerJ.Event.Follow.init();
+ });
+
+ //Show citations modal if query param exists
+ var urlParams = new URLSearchParams(window.location.search);
+ if(urlParams.has('citations')){
+ $('#citing-modal').modal('show');
+ }
+
+ </script>
+
+
+<script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ messageStyle: "none",
+ imageFont: null,
+ "CommonHTML": {
+ linebreaks: { automatic: true },
+ scale: 95
+ },
+ "HTML-CSS": {
+ linebreaks: { automatic: true },
+ scale: 90
+ },
+ menuSettings: {
+ zoom: "Click"
+ }
+ });
+
+ MathJax.Ajax.config.root = "/bundles/peerjmathjax/MathJax/";
+</script>
+
+<script src="/bundles/peerjmathjax/MathJax/MathJax.js?config=TeX-MML-AM_HTMLorMML,Safe&noContrib"></script>
+
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research"}'></script>
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research-sidebar"}'></script>
+ <script async src="https://badge.dimensions.ai/badge.js" charset="utf-8"></script>
+
+ <div id="content-alert-container"></div>
+
+ <div id="toast-container"></div>
+
+ <div id="vue-notifications"></div>
+
+ <div id="vue-confirm-modal"></div>
+
+ <script>
+ $(PeerJ.Home.Banner.init);
+ </script>
+
+ </body>
+</html>
diff --git a/python/tests/files/scielo_article.jats.xml b/python/tests/files/scielo_article.jats.xml
new file mode 100644
index 0000000..08c864e
--- /dev/null
+++ b/python/tests/files/scielo_article.jats.xml
@@ -0,0 +1,336 @@
+<?xml version="1.0" encoding="ISO-8859-1"?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<front>
+<journal-meta>
+<journal-id>1683-9803</journal-id>
+<journal-title><![CDATA[Pediatría (Asunción)]]></journal-title>
+<abbrev-journal-title><![CDATA[Pediatr. (Asunción)]]></abbrev-journal-title>
+<issn>1683-9803</issn>
+<publisher>
+<publisher-name><![CDATA[Sociedad Paraguaya de Pediatría]]></publisher-name>
+</publisher>
+</journal-meta>
+<article-meta>
+<article-id>S1683-98032015000200002</article-id>
+<article-id pub-id-type="doi">10.18004/ped.2015.agosto.102-107</article-id>
+<title-group>
+<article-title xml:lang="es"><![CDATA[Prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años en las comunidades indígenas de Yby Yau y Azote’y, 2011]]></article-title>
+<article-title xml:lang="en"><![CDATA[Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011]]></article-title>
+</title-group>
+<contrib-group>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Valiente]]></surname>
+<given-names><![CDATA[Syntia Carolina]]></given-names>
+</name>
+<xref ref-type="aff" rid="A01"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Cañete]]></surname>
+<given-names><![CDATA[Manuel]]></given-names>
+</name>
+<xref ref-type="aff" rid="A02"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Cohene Velazquez]]></surname>
+<given-names><![CDATA[Bartola]]></given-names>
+</name>
+<xref ref-type="aff" rid="A03"/>
+</contrib>
+</contrib-group>
+<aff id="A01">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A02">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A03">
+<institution><![CDATA[,Puesto de Salud de Paso Tuya. Azote’y. Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<pub-date pub-type="pub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<pub-date pub-type="epub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<volume>42</volume>
+<numero>2</numero>
+<fpage>102</fpage>
+<lpage>107</lpage>
+<copyright-statement/>
+<copyright-year/>
+<self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_arttext&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_abstract&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_pdf&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><abstract abstract-type="short" xml:lang="es"><p><![CDATA[Introducción: La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrición. La desnutrición infantil no es solo un problema de falta de alimentos, es un conflicto social más profundo. La prevalencia de desnutrición en menores de 5 años del país es de 5,9% según datos del Instituto Nacional de Alimentación y Nutrición. Objetivo: Determinar la prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años de las comunidades indígenas de Yby Yaú y Azote’y. Materiales y Métodos: Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Pa'i Tavyterã y Mbya Guaraní de 11 comunidades indígenas de Yby Yau y Azote’y. Fueron examinados 349 menores de 5 años de edad. Para la evaluación del estado nutricional se utilizó la curva de crecimiento de la OMS. Los niños/as fueron pesados/as en balanzas mecánicas. Para la medida de la altura, los mayores de dos años fueron medidos con el tallimetro y los menores de 2 años con cinta métrica. Resultados: Se observó desnutrición en 53 niños que equivale al 15% de la muestra. De estos 60,4% padecían de desnutrición moderada y 39,6% desnutrición grave. El mayor porcentaje de desnutrición se encontró en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los niños tenían desnutrición crónica. Conclusiones: La prevalencia de desnutrición en indígenas en Yby Yaú y Azote’y es de 15%, lo que sobrepasa los índices de desnutrición en menores de 5 años del país.]]></p></abstract>
+<abstract abstract-type="short" xml:lang="en"><p><![CDATA[Introduction: Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. Objective: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Yaú Yby. Materials and Methods: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyterá and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. Results: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. Conclusions: The prevalence of malnutrition in indigenous children in Yby Yaú and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.]]></p></abstract>
+<kwd-group>
+<kwd lng="es"><![CDATA[Desnutrición aguda]]></kwd>
+<kwd lng="es"><![CDATA[desnutrición crónica]]></kwd>
+<kwd lng="es"><![CDATA[indígenas]]></kwd>
+<kwd lng="en"><![CDATA[Acute malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[chronic malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[indigenous]]></kwd>
+</kwd-group>
+</article-meta>
+</front><body><![CDATA[ <p align="right"><font size="3" face="Verdana"><b>ART&Iacute;CULO ORIGINAL</b></font></p> <p align="left">&nbsp;</p> <p align="left"><font size="4" face="Verdana"><b>Prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en&nbsp; ni&ntilde;os menores de 5 a&ntilde;os en las comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y, 2011</b></font></p> <p align="left"><font size="3" face="Verdana"><b><i>Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011</i></b></font></p> <p align="center">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>Syntia Carolina Ruiz Valiente<sup>(1)</sup>, Manuel Ruiz Ca&ntilde;ete<sup>(2)</sup>, Bartola Cohene Velazquez<sup>(3)</sup></b></font></p> <p align="left"> <font size="2" face="Verdana">1. Hospital General Pedi&aacute;trico Ni&ntilde;os Acosta &Ntilde;u. Reducto-San Lorenzo, Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">2. Centro de Salud de Yby Yau. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">3. Puesto de Salud de Paso Tuya. Azote&rsquo;y. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana"><b>Correspondencia</b>: Syntia Carolina Ruiz Valiente. E-mail: scrv_py@hotmail.com</font></p> ]]></body>
+<body><![CDATA[<p align="left"> <font size="2" face="Verdana">Recibido: 24/01/2015; Aceptado: 10/06/2015.</font></p> <p align="left"> <font size="2" face="Verdana"><i>Los autores declaran que no existen conflictos de inter&eacute;s en el presente estudio.</i></font></p> <p align="left">&nbsp;</p> <hr size="1" noshade> <p align="left"><font size="2" face="Verdana"><b>RESUMEN</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introducci&oacute;n: </b>La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrici&oacute;n. La desnutrici&oacute;n infantil no es solo un problema de falta de alimentos, es un conflicto social m&aacute;s profundo. La prevalencia de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s es de 5,9% seg&uacute;n datos del Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n. <b>Objetivo</b>: Determinar la prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby Ya&uacute; y Azote&rsquo;y. <b>Materiales y M&eacute;todos:</b> Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa'i Tavyter&atilde; y Mbya Guaran&iacute; de 11 comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y. Fueron examinados 349 menores de 5 a&ntilde;os de edad. Para la evaluaci&oacute;n del estado nutricional se utiliz&oacute; la curva de crecimiento de la OMS. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas. Para la medida de la altura, los mayores de dos a&ntilde;os fueron medidos con el tallimetro y los menores de 2 a&ntilde;os con cinta m&eacute;trica. <b>Resultados:</b> Se observ&oacute; desnutrici&oacute;n en 53 ni&ntilde;os que equivale al 15% de la muestra. De estos 60,4% padec&iacute;an de desnutrici&oacute;n moderada y 39,6% desnutrici&oacute;n grave. El mayor porcentaje de desnutrici&oacute;n se encontr&oacute; en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los ni&ntilde;os ten&iacute;an desnutrici&oacute;n cr&oacute;nica. <b>Conclusiones:</b> La prevalencia de desnutrici&oacute;n en ind&iacute;genas en Yby Ya&uacute; y Azote&rsquo;y es de 15%, lo que sobrepasa los &iacute;ndices de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s.</font></p> <p align="left"><font size="2" face="Verdana"><b>Palabras clave:</b> Desnutrici&oacute;n aguda, desnutrici&oacute;n cr&oacute;nica, ind&iacute;genas.</font></p> <p align="left">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>ABSTRACT</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introduction:</b> Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age&nbsp; in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. <b>Objective</b>: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Ya&uacute; Yby. 
<b>Materials and Methods</b>: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyter&aacute; and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. <b>Results</b>: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. <b>Conclusions</b>: The prevalence of malnutrition in indigenous children in Yby Ya&uacute; and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.</font></p> <p align="left"><font size="2" face="Verdana"><b>Keywords</b>: Acute malnutrition, chronic malnutrition, indigenous.</font></p> <hr size="1" noshade> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>INTRODUCCI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">La desnutrici&oacute;n es una enfermedad multisist&eacute;mica, que afecta todos los &oacute;rganos y sistemas del ser humano, es producida por una disminuci&oacute;n dr&aacute;stica, aguda o cr&oacute;nica, en la disponibilidad de nutrimentos, ya sea por ingesti&oacute;n insuficiente, inadecuada absorci&oacute;n, exceso de p&eacute;rdidas o la conjunci&oacute;n de dos o m&aacute;s de estos factores. Se manifiesta por grados de d&eacute;ficit antropom&eacute;trico, signos y s&iacute;ntomas cl&iacute;nicos y alteraciones bioqu&iacute;micas, hematol&oacute;gicas e inmunol&oacute;gicas (1).</font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n ind&iacute;gena est&aacute; gravemente afectada por este problema, tal vez por ser un estrato olvidado y descuidado por la poblaci&oacute;n en general y por el estado paraguayo. A pesar de las leyes, y de todos los proyectos que favorecen a esta esfera de la sociedad, a&uacute;n existe un abismo inimaginable entre lo ideal y lo real. Mientras se elaboran programas que buscan dar mejores condiciones de vida a estas comunidades, que la mayor&iacute;a de las veces solo quedan plasmados en el papel, los &iacute;ndices de desnutrici&oacute;n son alarmantes. Esto se debe probablemente a que en la sociedad posmoderna, la deforestaci&oacute;n, el uso de agrot&oacute;xicos, la invasi&oacute;n de los terratenientes despoj&oacute; a los nativos de sus tierras, oblig&aacute;ndolos a vivir en situaciones carenciales, pues estos debido a su cultura esperan que la naturaleza les ofrezca el sustento diario. Las costumbres, la econom&iacute;a y la religi&oacute;n en las etnias Paí Tavyter&atilde; y Mby`a Guaran&iacute; est&aacute;n &iacute;ntimamente relacionadas a la producci&oacute;n alimenticia e ingesta.</font></p> <p align="left"><font size="2" face="Verdana">Para el nativo guaran&iacute; es muy dif&iacute;cil comprender que el hombre es el que debe producir alimento para su sustento, pero como la sociedad actual obliga a ello, estos por no conseguir adaptarse a los cambios que se produjeron, est&aacute;n m&aacute;s expuestos a las carencias alimentarias. Seg&uacute;n datos del gobierno central en el 2008, 41,8% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os padec&iacute;an de desnutrici&oacute;n.</font></p> <p align="left"><font size="2" face="Verdana">En un estudio realizado en M&eacute;xico, la prevalencia de desnutrici&oacute;n en ind&iacute;genas fue 39,4%(2). Un 44% present&oacute; uno o m&aacute;s signos cl&iacute;nicos de malnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Encuestas y Censos del Ecuador (2001 y 2006) 40,1% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os tienen desnutrici&oacute;n cr&oacute;nica (3).</font></p> <p align="left"><font size="2" face="Verdana">En Caracas, se hizo un estudio con la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, y ellos obtuvieron el siguiente resultado: El diagn&oacute;stico nutricional hallado con mayor frecuencia fue Nutrici&oacute;n normal (55%) seguida por Desnutrici&oacute;n Subcl&iacute;nica (15%) y Desnutrici&oacute;n Leve (12%). 
En l&iacute;neas generales, un 55% de la poblaci&oacute;n se encontraba en rangos de nutrici&oacute;n normal, mientras el 45% restante presentaba problema de malnutrici&oacute;n comprendiendo &eacute;sta por d&eacute;ficit y por exceso (4).</font></p> <p align="left"><font size="2" face="Verdana">En el Brasil en un estudio realizado para determinar el perfil nutricional de los abor&iacute;genes menores de 5 a&ntilde;os de Kaing&aacute;ngen Paran&aacute; vieron que cuando utilizado los criterios propuestos por la OMS, se registr&oacute; una alta prevalencia de d&eacute;ficit Estatura/Edad, con uno en cuatro ni&ntilde;os (24,8%) que presentaba este diagn&oacute;stico. El d&eacute;ficit de Peso/Edad fue diagnosticado en 9,2% de los ni&ntilde;os evaluados. Los &iacute;ndices de peso para la altura diagnosticaron solo tres ni&ntilde;os (2,1%) como desnutridas agudas (5).</font></p> <p align="left"><font size="2" face="Verdana">En otro estudio realizado tambi&eacute;n en el Brasil, esta vez en Amazonia, con ni&ntilde;os de la etnia Suru&iacute; se observ&oacute; que los porcentajes de los ni&ntilde;os con d&eacute;ficit en los &iacute;ndices de estatura para la edad fue 31,4%, peso para la edad 12,4% y peso para la estatura 0% (6).</font></p> <p align="left"><font size="2" face="Verdana">El objetivo del presente estudio es determinar la prevalencia de desnutrici&oacute;n en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y y conocer el comportamiento alimentario de los ni&ntilde;os/as de las comunidades ind&iacute;genas estudiadas.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>MATERIALES Y M&Eacute;TODOS</b></font></p> <p align="left"><font size="2" face="Verdana">Estudio transversal, descriptivo realizado en el periodo de enero a abril del a&ntilde;o 2011, donde se identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa&#297; Tavyter&atilde; y Mby`a Guaran&iacute; en los distritos de Yby-Ya&uacute; y Azote&rsquo;y.</font></p> <p align="left"><font size="2" face="Verdana">El tama&ntilde;o muestral total fue de 370 ni&ntilde;os, determinado a trav&eacute;s de censo realizado por el Centro de Salud de Yby-Ya&uacute; y el Puesto de Salud de Paso Tuya. Para los fines del estudio fueron identificados 349 ni&ntilde;os (94.3%) de ni&ntilde;os reci&eacute;n nacidos a menores de 5 a&ntilde;os en los distritos de Yby-Ya&uacute; y Azote'y.</font></p> <p align="left"><font size="2" face="Verdana">Las etnias que se encuentran dentro del &aacute;rea de estudio est&aacute; compuesta por los mby`a guaran&iacute; y los pa&#297; tavyter&atilde;, distribuidas en las siguientes comunidades ind&iacute;genas: Vy'apav&#7869;, Yrapey, Guyrakeha, Guyra &Ntilde;e'engatuamba, Satí;, San Juan, Mbery'o Jaguarymi, Ka'aguy Poty Rory, Yvyra'ija, Tukambiju y Takuaritiy.</font></p> <p align="left"><font size="2" face="Verdana">El trabajo se realiz&oacute; por concentraci&oacute;n, en los locales fijados por los l&iacute;deres de las distintas comunidades. Fue aplicado un cuestionario a las madres, creado para el efecto por medio de entrevista. La edad de los ni&ntilde;os fue dada por las madres, pues la mayor&iacute;a de estas no cuentan con registro de nacimiento, ni siquiera certificado de nacido vivo.</font></p> <p align="left"><font size="2" face="Verdana">Para la evaluaci&oacute;n del estado nutricional de los ni&ntilde;os se opt&oacute; por la curva del gr&aacute;fico de crecimiento de la Organizaci&oacute;n Mundial de la Salud (OMS) lo cual est&aacute; contenido en la libreta del ni&ntilde;o y la ni&ntilde;a. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas, los que ya consegu&iacute;an quedarse de pie fueron pesados en balanza de pie y los ni&ntilde;os menores de 1 a&ntilde;o en balanzas colgantes.</font></p> <p align="left"><font size="2" face="Verdana">Para la medida de la altura, los ni&ntilde;os mayores de dos a&ntilde;os fueron colocados en posici&oacute;n de pie, bien rectos, y fueron medidos con el tallimetro. La talla de los ni&ntilde;os menores de 2 a&ntilde;os fue realizada con cinta m&eacute;trica con el ni&ntilde;o/a en dec&uacute;bito supino en superficie recta.</font></p> <p align="left"><font size="2" face="Verdana">Los datos fueron analizados manualmente, y los gr&aacute;ficos confeccionados con el programa Microsoft Office Excel 2007.</font></p> <p align="justify">&nbsp;</p> ]]></body>
+<body><![CDATA[<p align="left"><font size="3" face="Verdana"><b>RESULTADOS</b></font></p> <p align="left"><font size="2" face="Verdana">Se evaluaron 349 ni&ntilde;os, que representan el 94,3% del total de abor&iacute;genes menores de 5 a&ntilde;os de las comunidades de Yby-Ya&uacute; y Azote&rsquo;y. Del total de 349 ni&ntilde;os, 69 % (240) son Paí; Tavyter&atilde; y 31% (109) Mby`a Guaran&iacute;. </font></p> <p align="left"><font size="2" face="Verdana">La comunidad con el mayor porcentaje de ni&ntilde;os fue la de Vy'&atilde;pav&#7869; (36,4%), y la de menor frecuencia fue la comunidad de Tekoha Kag&atilde;t&atilde;, que es una comunidad reci&eacute;n formada localizada en Pasi&ntilde;o (<a href="#2a02f1">Figura 1</a>).</font></p> <p align="center"><a name="2a02f1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f1.jpg"></p> <p align="left"><font size="2" face="Verdana">Viendo el perfil nutricional de los ni&ntilde;os, se pudo observar que 61% de los ni&ntilde;os/as no est&aacute;n desnutridos, 24% de los ni&ntilde;os/as est&aacute;n en riesgo de desnutrici&oacute;n y 15% est&aacute;n con desnutrici&oacute;n. Aunque se trata de un estrato social desfavorecido tambi&eacute;n se observa &iacute;ndice de sobrepeso y obesidad, en las comunidades de Vy'&atilde;pav&#7869; e Yrapey (<a href="#2a02f2">Figura 2</a>).</font></p> <p align="center"><a name="2a02f2"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f2.jpg"></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">Teniendo presente los gr&aacute;ficos de Talla/Edad la prevalencia de desnutrici&oacute;n cr&oacute;nica es bastante elevada, pues 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. El mayor &iacute;ndice de desnutrici&oacute;n se encuentran en los primeros 24 meses de vida (<a href="#2a02t1">Tabla 1</a>). De los 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. Siendo que el mayor porcentaje de desnutrici&oacute;n se observa en Vy'&atilde;pav&#7869;.</font></p> <p align="center"><a name="2a02t1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02t1.jpg"></p> <p align="left"><font size="2" face="Verdana">Se estudi&oacute; adem&aacute;s el comportamiento alimentario de estos ni&ntilde;os, viendo que alimentos preferencialmente hacen parte de su dieta y la edad de introducci&oacute;n de los mismos, la mayor&iacute;a de las madres introducen alg&uacute;n tipo alimento entre los 6 y 8 meses de edad (<a href="#2a02f3">Figura 3</a>) y los primeros alimentos introducidos dependen del lugar donde estos habitan. El caldo de pescado es uno de los primeros alimentos introducidos en las comunidades que viven cerca de los r&iacute;os, entretanto el 60% inician la alimentaci&oacute;n con caldo de arroz y caldo de fideo.</font></p> <p align="center"><a name="2a02f3"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f3.jpg"></p> <p align="left"><font size="2" face="Verdana">Al observar la frecuencia en que se alimentan estos ni&ntilde;os, el 64% se alimenta tres veces al d&iacute;a, el 20% menos de 3 veces al d&iacute;a y solo el 16 % m&aacute;s de tres veces al d&iacute;a.</font></p> <p align="left"><font size="2" face="Verdana">El principal nutriente en la dieta son los carbohidratos, el 47% de los ni&ntilde;os consumen carbohidratos m&aacute;s de 5 veces por semana, y el 21% menos de 3 veces por semana. El mayor porcentaje de consumo de prote&iacute;nas se observa en las comunidades que se encuentran cerca de r&iacute;os (Guyra &Ntilde;e`engatuamba y Mbery'o Jaguarymi), siendo que 70% consume prote&iacute;nas menos de 3 veces por semana, y solo el 3% m&aacute;s de cinco veces por semana. El consumo de verduras y hortalizas es muy escaso, el 91% consume verduras y hortalizas menos de 3 veces por semana, el 2% m&aacute;s de 5 veces y 7% entre 3 y 5 veces por semana.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>DISCUSI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">A lo largo de toda la historia de la humanidad, la desnutrici&oacute;n ha sido una patolog&iacute;a de las clases sociales menos privilegiadas, son los que no poseen las condiciones necesarias para tener una vida digna, donde la educaci&oacute;n, salud, recursos econ&oacute;micos son miserables, donde esta dolencia alcanza su auge (7).</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n los datos del Censo realizado por la Unidad de Salud Ind&iacute;gena que se encuentra en el Distrito de Yby-Ya&uacute;, los Puestos de Salud de Yby- Ya&uacute; y Azote&rsquo;y en el tercer trimestre del A&ntilde;o 2010, se encontraron 328 ni&ntilde;os de hasta 60 meses (8). Al realizar los trabajos de campo, este n&uacute;mero se elev&oacute; a 349 individuos, por lo que se hizo un nuevo censo solo con los ni&ntilde;os de este grupo etario. Ese fen&oacute;meno tal vez, se deba a la migraciones que se desarrollan normalmente entre los guaran&iacute;. Al observar la historia, y tambi&eacute;n por la experiencia que se adquiri&oacute; durante el trabajo de campo, se pudo observar la familia ling&uuml;&iacute;stica a la cual pertenecen los mby`a y los paí; (la guaran&iacute;) son n&oacute;madas, es com&uacute;n que migren a otras comunidades, en un mismo Tekoha (9,10).</b></font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n diana fue de 370 ni&ntilde;os menores de 5 a&ntilde;os de los cuales se lleg&oacute; a entrevistar a las madres de 349 y se hizo las mediciones antropom&eacute;tricas posteriormente. En la mayor&iacute;a de las comunidades ind&iacute;genas se obtuvo el 100% de participaci&oacute;n, son excepciones las comunidades de Yrapey y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Del total de ni&ntilde;os/as, la etnia de mayor prevalencia fue la de Paí; Tavyter&atilde;. En relaci&oacute;n al sexo, las comunidades son bastante equilibradas, con una ligera prevalencia del sexo masculino sobre el femenino.</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n datos de la UNICEF en Paraguay se observa 3,4% de desnutrici&oacute;n aguda en ni&ntilde;os menores de 5 a&ntilde;os (11). La prevalencia de desnutrici&oacute;n en los ni&ntilde;os paraguayos menores de 5 a&ntilde;os en el &aacute;rea rural es de 5,9% y en el &aacute;rea urbana es de 4,5% (12). Existen pocas publicaciones sobre este tema en abor&iacute;genes menores de 5 a&ntilde;os, siendo que el mayor n&uacute;mero de publicaciones fue realizado por el Brasil (12,4%), M&eacute;xico (39,4%) y Ecuador.</b></font></p> <p align="left"><font size="2" face="Verdana">La prevalencia de desnutrici&oacute;n en las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y es de 15,2%, observando los gr&aacute;ficos de Peso/edad si de 2 a&ntilde;os y Peso/Talla en mayores de 2 a&ntilde;os y menores de 5 a&ntilde;os. Las comunidades donde la desnutrici&oacute;n son m&aacute;s prevalentes son Guyrakeha e Yvyra'ija; en Satí; y Tekoha Kagat&atilde; no se encontr&oacute; ni&ntilde;os desnutridos.</b></font></p> <p align="left"><font size="2" face="Verdana">De 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. 
El grupo con mayor &iacute;ndice de desnutrici&oacute;n, se encuentra durante los primeros 24 meses, pues es en esta etapa donde el organismo requiere una mayor cantidad de nutrientes por el mayor crecimiento. Adem&aacute;s, despu&eacute;s de los 6 meses se inicia la introducci&oacute;n de otros alimentos. Estos dos factores, asociados aumentan el &iacute;ndice de desnutrici&oacute;n en este grupo de edad.</b></font></p> <p align="left"><font size="2" face="Verdana">De la poblaci&oacute;n total de los ni&ntilde;os estudiados el 23,8% est&aacute;n con riesgo de desnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n (INAN) en el a&ntilde;o 2010, 13,6% de ni&ntilde;os menores de 5 a&ntilde;os del &aacute;rea urbana y 16,2% del &aacute;rea rural del Paraguay sufren desnutrici&oacute;n cr&oacute;nica. En una encuesta realizada por la Direcci&oacute;n General de Estad&iacute;stica, Encuestas y Censos en el a&ntilde;o 2008, 41,8% de los ni&ntilde;os/as ind&iacute;genas menores de cinco a&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Observadas las medidas de Talla/Edad el 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Ese dato es alarmante, porque la desnutrici&oacute;n cr&oacute;nica es consecuencia de una carencia prolongada de alimentos o enfermedades sucesivas. En Tukambiju, Mbery'o Jaguarymi, Guyrakeha, Yvyra'ija y Satí; son comunidades con una prevalencia mayor al 80% de ni&ntilde;os/as con talla baja para la edad.</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">El &iacute;ndice de desnutrici&oacute;n en ind&iacute;genas en los distritos de Yby-Ya&uacute; y Azote&rsquo;y, sobrepasa la prevalencia general de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s, lo cual est&aacute; alrededor de 5.9% seg&uacute;n datos del INAN.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas se puede observar que un porcentaje razonable introduce alimentos entre los 6 meses y antes de los 9 meses. El porcentaje de los que introducen antes de los 6 meses es de 18,6% y entre los 9 meses y un a&ntilde;o es de 27%. Se pudo observar que, ocho ni&ntilde;os tuvieron lactancia materna exclusiva por m&aacute;s de 1 a&ntilde;o. Todos los ni&ntilde;os/as con lactancia materna exclusiva en la fecha de la recolecci&oacute;n de datos ten&iacute;a menos de 6 meses o 6 meses. El caldo de fideo y de arroz ocupa el primer y segundo lugar respectivamente como primer alimento introducido por las madres. Los alimentos que deber&iacute;an ser introducidos inicialmente como el pur&eacute; de frutas y verduras ocupan un peque&ntilde;o porcentaje en la lista. Otros alimentos que se tendr&iacute;an que introducir despu&eacute;s de los 9 meses, de preferencia a los un a&ntilde;o, como por ejemplo el caldo de poroto, caldo de pescado, leche de vaca y huevo son los primeros alimentos que se introducen.</b></font></p> <p align="left"><font size="2" face="Verdana">El 64% de los ni&ntilde;os se alimentan tres veces al d&iacute;a, el 20,5% menos de tres veces y 15,5% m&aacute;s de tres veces al d&iacute;a.</b></font></p> <p align="left"><font size="2" face="Verdana">El 69,5% de los ni&ntilde;os/as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y consumen prote&iacute;nas menos de tres veces por semana; 27,3% consumen de tres a cinco veces por semana los diferentes tipos de prote&iacute;nas, teniendo predominancia el consumo de pez. Solo 3,2% consume prote&iacute;nas m&aacute;s de 5 veces. Las comunidades que viven cerca de bosques, r&iacute;os o arroyos son los que m&aacute;s consumen prote&iacute;nas.</b></font></p> <p align="left"><font size="2" face="Verdana">Los carbohidratos son la principal fuente de alimentaci&oacute;n de los ni&ntilde;os y ni&ntilde;as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y. Eso se debe a que son los alimentos de m&aacute;s f&aacute;cil adquisici&oacute;n y los m&aacute;s accesibles econ&oacute;micamente hablando.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas el consumo de verduras y hortalizas es escaso. Las comunidades que m&aacute;s consumen verduras y hortalizas son Mberyo Jaguarymi y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Este trabajo refleja la realidad de las comunidades ind&iacute;genas de los dos distritos observados, no podemos extrapolar estas mismas cifras en el departamento de Concepci&oacute;n, o en todo el pa&iacute;s por el tama&ntilde;o de la muestra, es necesario hacer nuevos estudios con un tama&ntilde;o muestral mayor para obtener una visi&oacute;n del verdadero estado nutricional de los ni&ntilde;os ind&iacute;genas. 
El porcentaje de desnutrici&oacute;n es alto, pero se trata de distritos con no muchos recursos econ&oacute;micos, donde la pobreza es una realidad a&uacute;n en otros estratos sociales.</b></font></p> <p align="left"><font size="2" face="Verdana">La realidad ind&iacute;gena es un problema real, y una manera de reducir estas cifras es ense&ntilde;&aacute;ndoles a producir su propio alimento. Para ello no debemos luchar con su cultura ni intentar hacerlos ver el mundo a trav&eacute;s de nuestra realidad, sino dentro de sus costumbres encontrar formas de que ellos tengan condiciones de un mejor porvenir.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>AGRADECIMIENTOS</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">A las comunidades ind&iacute;genas que participaron en nuestro estudio, los profesionales de blanco del Centro de Salud de Yby-Yau y Azote&rsquo;y, a la Comunidad de Hermanas de la Divina Providencia de Yby-Yau, a la Dra. Blanca Villalba y a la Dra. Gloria Mart&iacute;nez.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>REFERENCIAS</b></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">1. Monteiro CA. Fome, desnutri&ccedil;&atilde;o e pobreza: al&eacute;m da sem&acirc;ntica. Sa&uacute;de Soc. 2003;12(1):7-11. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102986&pid=S1683-9803201500020000200001&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">2. Vi&ntilde;as MR, Fr&iacute;as ML, Verd&uacute; JM. Entorno social y desnutrici&oacute;n en ni&ntilde;os de 1 a 4 a&ntilde;os de comunidades ind&iacute;genas de M&eacute;xico. Rev Esp Nutr Comunitaria. 2005;11(3):128-34. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102988&pid=S1683-9803201500020000200002&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">3. INEC. Ecuador: 40,1% de ind&iacute;genas con desnutrici&oacute;n cr&oacute;nica. Ecuador: Estudio del INEC; 2009. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102990&pid=S1683-9803201500020000200003&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">4. Chumpitaz D, Russo A, Del NogaL B, Case C, Lares M. Evaluaci&oacute;n nutricional de la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004. AVFT. 2006;25(1):26-31. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102992&pid=S1683-9803201500020000200004&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">5. Kuhl AM, Tittoni C, Leite MS, Bastos JL. Perfil Nutricional e fatores associados &agrave; ocorr&ecirc;ncia de desnutri&ccedil;&atilde;o entre crian&ccedil;as ind&iacute;genas Kaing&aacute;ng da Terra Ind&iacute;gena de Mangueirinha, Paran&aacute;, Brasil. Cad Sa&uacute;de P&uacute;blica. 2009;25(2):409-420. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102994&pid=S1683-9803201500020000200005&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">6. Orellana JD, Coimbra Jr. CE, Louren&ccedil;o AE, Santos RV. Estado nutricional e anemia en crian&ccedil;as Suru&iacute;, Amaz&ocirc;nia, Brasil. J Pediatr (Rio J). 2006;82(5):383-88. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102996&pid=S1683-9803201500020000200006&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">7. Organizaci&oacute;n de las Naciones Unidas. Foro permanente para las cuestiones ind&iacute;genas: informe sobre el quinto per&iacute;odo de sesiones (15 a 26 de mayo de 2006). Nueva York: Naciones Unidas; 2006. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102998&pid=S1683-9803201500020000200007&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">8. Centro de Salud de Yby-Yau. Censo local de las comunidades ind&iacute;genas. Yby-Yau; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103000&pid=S1683-9803201500020000200008&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">9. Chase-Sardi M, Brun A, Enciso MA. Situaci&oacute;n sociocultural, econ&oacute;mica, jur&iacute;dico-pol&iacute;tico actual de las comunidades ind&iacute;genas del Paraguay. Asunci&oacute;n: UCA; 1989. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103002&pid=S1683-9803201500020000200009&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">10. Meliá B, Grunberg G, Grunberg F. Paî -Tavyterã: etnograf&iacute;a guaran&iacute; del Paraguay contempor&aacute;neo. 2da. ed. Asunci&oacute;n: Centro de Estudios Antrop&oacute;logicos de la Universidad Cat&oacute;lica; 2008. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103004&pid=S1683-9803201500020000200010&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">11. FAO. Panorama de la seguridad alimentaria y nutricional en Am&eacute;rica Latina y el Caribe 2013. FAO; 2014. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103006&pid=S1683-9803201500020000200011&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --> </font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">12. Masi C, S&aacute;nchez Bernal S, Dallman D, Rodas A, Morinigo G, Mendoza L. Perfil nutricional de ni&ntilde;os menores de 5 a&ntilde;os que acuden a servicios p&uacute;blicos de salud en el Paraguay. Asunci&oacute;n: INAN; 2010. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103008&pid=S1683-9803201500020000200012&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> ]]></body><back>
+<ref-list>
+<ref id="B1">
+<label>1</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Monteiro]]></surname>
+<given-names><![CDATA[CA]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Fome, desnutrição e pobreza: além da semântica]]></article-title>
+<source><![CDATA[Saúde Soc]]></source>
+<year>2003</year>
+<volume>12</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>7-11</page-range></nlm-citation>
+</ref>
+<ref id="B2">
+<label>2</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Viñas]]></surname>
+<given-names><![CDATA[MR]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Frías]]></surname>
+<given-names><![CDATA[ML]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Verdú]]></surname>
+<given-names><![CDATA[JM]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="es"><![CDATA[Entorno social y desnutrición en niños de 1 a 4 años de comunidades indígenas de México]]></article-title>
+<source><![CDATA[Rev Esp Nutr Comunitaria]]></source>
+<year>2005</year>
+<volume>11</volume>
+<numero>3</numero>
+<issue>3</issue>
+<page-range>128-34</page-range></nlm-citation>
+</ref>
+<ref id="B3">
+<label>3</label><nlm-citation citation-type="book">
+<collab>INEC</collab>
+<source><![CDATA[Ecuador: 40,1% de indígenas con desnutrición crónica]]></source>
+<year>2009</year>
+<publisher-loc><![CDATA[Ecuador ]]></publisher-loc>
+<publisher-name><![CDATA[Estudio del INEC]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B4">
+<label>4</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chumpitaz]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Russo]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Del NogaL]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Case]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lares]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Evaluación nutricional de la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004]]></article-title>
+<source><![CDATA[AVFT]]></source>
+<year>2006</year>
+<volume>25</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>26-31</page-range></nlm-citation>
+</ref>
+<ref id="B5">
+<label>5</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Kuhl]]></surname>
+<given-names><![CDATA[AM]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Tittoni]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Leite]]></surname>
+<given-names><![CDATA[MS]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Bastos]]></surname>
+<given-names><![CDATA[JL]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Perfil Nutricional e fatores associados à ocorrência de desnutrição entre crianças indígenas Kaingáng da Terra Indígena de Mangueirinha, Paraná, Brasil]]></article-title>
+<source><![CDATA[Cad Saúde Pública]]></source>
+<year>2009</year>
+<volume>25</volume>
+<numero>2</numero>
+<issue>2</issue>
+<page-range>409-420</page-range></nlm-citation>
+</ref>
+<ref id="B6">
+<label>6</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Orellana]]></surname>
+<given-names><![CDATA[JD]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Coimbra Jr]]></surname>
+<given-names><![CDATA[CE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lourenço]]></surname>
+<given-names><![CDATA[AE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Santos]]></surname>
+<given-names><![CDATA[RV]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Estado nutricional e anemia en crianças Suruí, Amazônia, Brasil]]></article-title>
+<source><![CDATA[J Pediatr (Rio J)]]></source>
+<year>2006</year>
+<volume>82</volume>
+<numero>5</numero>
+<issue>5</issue>
+<page-range>383-88</page-range></nlm-citation>
+</ref>
+<ref id="B7">
+<label>7</label><nlm-citation citation-type="book">
+<collab>Organización de las Naciones Unidas</collab>
+<source><![CDATA[Foro permanente para las cuestiones indígenas: informe sobre el quinto período de sesiones (15 a 26 de mayo de 2006)]]></source>
+<year>2006</year>
+<publisher-loc><![CDATA[Nueva York ]]></publisher-loc>
+<publisher-name><![CDATA[Naciones Unidas]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B8">
+<label>8</label><nlm-citation citation-type="">
+<collab>Centro de Salud de Yby-Yau</collab>
+<source><![CDATA[Censo local de las comunidades indígenas]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Yby-Yau ]]></publisher-loc>
+</nlm-citation>
+</ref>
+<ref id="B9">
+<label>9</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chase-Sardi]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Brun]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Enciso]]></surname>
+<given-names><![CDATA[MA]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Situación sociocultural, económica, jurídico-político actual de las comunidades indígenas del Paraguay]]></source>
+<year>1989</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[UCA]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B10">
+<label>10</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Meliá]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[F]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Paî -Tavyterã: etnografía guaraní del Paraguay contemporáneo. 2da. ed]]></source>
+<year>2008</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[Centro de Estudios Antropólogicos de la Universidad Católica]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B11">
+<label>11</label><nlm-citation citation-type="book">
+<collab>FAO</collab>
+<source><![CDATA[Panorama de la seguridad alimentaria y nutricional en América Latina y el Caribe 2013]]></source>
+<year>2014</year>
+<publisher-name><![CDATA[FAO]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B12">
+<label>12</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Masi]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Sánchez Bernal]]></surname>
+<given-names><![CDATA[S]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Dallman]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Rodas]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Morinigo]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Mendoza]]></surname>
+<given-names><![CDATA[L]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Perfil nutricional de niños menores de 5 años que acuden a servicios públicos de salud en el Paraguay]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[INAN]]></publisher-name>
+</nlm-citation>
+</ref>
+</ref-list>
+</back>
+</article>
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 3f84ea4..3839c99 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -27,21 +27,16 @@
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
+ "pages": "1-11",
"title": "Everything is Wonderful",
- "url": null,
"volume": "20"},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
"volume": "14"}
],
"abstract": "Everything you ever wanted to know about nothing",
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 36d90ef..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,17 +1,18 @@
+import json
+import struct
import pytest
-import struct
import responses
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from test_wayback import wayback_client, cdx_client
-
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f:
REAL_TEI_XML = f.read()
+
@pytest.fixture
def grobid_client():
client = GrobidClient(
@@ -19,61 +20,203 @@ def grobid_client():
)
return client
+
@responses.activate
def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=503,
- body=status)
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=503,
+ body=status,
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets exactly one POST
assert len(responses.calls) == 1
- assert resp['status_code'] == 503
- assert resp['status'] == "error"
+ assert resp["status_code"] == 503
+ assert resp["status"] == "error"
+
+
+@responses.activate
+def test_grobid_success_iso_8859(grobid_client):
+ """
+ This might have been the old GROBID behavior, with default encoding? Can't really remember.
+ """
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+    # grobid gets exactly one POST
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ # print(type(resp['tei_xml']))
+ # print(type(REAL_TEI_XML))
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1")
+
@responses.activate
def test_grobid_success(grobid_client):
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets exactly one POST
assert len(responses.calls) == 1
- assert resp['status_code'] == 200
- assert resp['status'] == "success"
- #print(type(resp['tei_xml']))
- #print(type(REAL_TEI_XML))
- assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
@responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
+ assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
- assert len(responses.calls) == worker.counts['total']
+ refs_row = grobid_client.crossref_refs(crossref_work)
+    # grobid gets exactly one POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # test that 'message' works also
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+    # grobid gets exactly one POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b00a88d 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,22 +1,28 @@
-
-import xml
import json
+import xml
+
import pytest
-from grobid2json import *
+from grobid_tei_xml import parse_document_xml
def test_small_xml():
-
- with open('tests/files/small.xml', 'r') as f:
+ """
+    This used to be a test of grobid2json; now it is a compatibility test for
+ the to_legacy_dict() feature of grobid_tei_xml.
+ """
+
+ with open("tests/files/small.xml", "r") as f:
tei_xml = f.read()
- with open('tests/files/small.json', 'r') as f:
- json_form = json.loads(f.read())
+ with open("tests/files/small.json", "r") as f:
+ json_form = json.loads(f.read())
+
+ tei_doc = parse_document_xml(tei_xml)
+ assert tei_doc.to_legacy_dict() == json_form
- assert teixml2json(tei_xml) == json_form
def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
- teixml2json("this is not XML")
+ parse_document_xml("this is not XML")
with pytest.raises(ValueError):
- teixml2json("<xml></xml>")
+ parse_document_xml("<xml></xml>")
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 3b59883..043c63d 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,41 +1,7 @@
-
-import json
-import pytest
-import responses
-
from sandcrawler.html import extract_fulltext_url
+
def test_extract_fulltext_url():
- resp = extract_fulltext_url("asdf", "asdf")
+ resp = extract_fulltext_url("asdf", b"asdf")
assert resp == {}
-
- resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
- <head>
- <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
- </head>
- <body>
- <h1>my big article here</h1>
- blah
- </body>
- </html>"""
- )
- assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp['technique'] == "citation_pdf_url"
-
- with open('tests/files/plos_one_article.html', 'r') as f:
- resp = extract_fulltext_url(
- "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
- f.read(),
- )
- assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
-
- with open('tests/files/elife_article.html', 'r') as f:
- resp = extract_fulltext_url(
- "https://elifesciences.org/articles/44753",
- f.read(),
- )
- assert resp['pdf_url'] == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
-
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
new file mode 100644
index 0000000..ba4acf1
--- /dev/null
+++ b/python/tests/test_html_ingest.py
@@ -0,0 +1,10 @@
+from sandcrawler.ingest_html import *
+
+
+def test_html_extract_ojs3() -> None:
+
+ with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f:
+ ojs3_html = f.read()
+
+ fulltext = html_extract_body_teixml(ojs3_html)
+ assert fulltext["status"] == "success"
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
new file mode 100644
index 0000000..69bd211
--- /dev/null
+++ b/python/tests/test_html_metadata.py
@@ -0,0 +1,261 @@
+import datetime
+
+import pytest
+
+from sandcrawler.html_metadata import *
+
+
+def test_html_metadata_plos() -> None:
+
+ with open("tests/files/plos_one_article.html", "r") as f:
+ plos_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
+ assert meta is not None
+ assert (
+ meta.title
+ == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ )
+ assert meta.doi == "10.1371/journal.pone.0213978"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
+ assert meta.contrib_names == [
+ "Yang Li",
+ "Tuanjie Wang",
+ "Lin Wang",
+ "Mingjun Sun",
+ "Zhizhong Cui",
+ "Shuang Chang",
+ "Yongping Wu",
+ "Xiaodong Zhang",
+ "Xiaohui Yu",
+ "Tao Sun",
+ "Peng Zhao",
+ ]
+ assert meta.container_name == "PLOS ONE"
+ assert meta.container_abbrev == "PLOS ONE"
+ # "Apr 22, 2019"
+ assert meta.release_date == datetime.date(year=2019, month=4, day=22)
+ assert meta.first_page == "e0213978"
+ assert meta.issue == "4"
+ assert meta.volume == "14"
+ assert meta.container_issn == "1932-6203"
+ assert meta.publisher == "Public Library of Science"
+ assert (
+ meta.raw_references
+ and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"
+ in meta.raw_references
+ )
+ assert meta.release_type == "article-journal"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
+
+
+def test_html_metadata_elife() -> None:
+
+ with open("tests/files/elife_article.html", "r") as f:
+ elife_html = f.read()
+
+ meta = html_extract_biblio(
+ "https://elifesciences.org/articles/44753", HTMLParser(elife_html)
+ )
+ assert meta is not None
+ assert meta.title == "Parallel visual circuitry in a basal chordate"
+ assert meta.doi == "10.7554/eLife.44753"
+ assert meta.contrib_names == [
+ "Matthew J Kourakis",
+ "Cezar Borba",
+ "Angela Zhang",
+ "Erin Newman-Smith",
+ "Priscilla Salas",
+ "B Manjunath",
+ "William C Smith",
+ ]
+ assert meta.container_name == "eLife"
+ # 2019-04-18
+ assert meta.release_date == datetime.date(year=2019, month=4, day=18)
+ assert meta.publisher == "eLife Sciences Publications Limited"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ )
+
+
+def test_html_metadata_peerj() -> None:
+
+ with open("tests/files/peerj_oa_article.html", "r") as f:
+ peerj_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
+ assert meta is not None
+ assert (
+ meta.title
+ == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ )
+ assert meta.doi == "10.7717/peerj.4375"
+ assert meta.contrib_names == [
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
+ ]
+ assert meta.container_name == "PeerJ"
+ # "2018-02-13"
+ assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+ assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url
+
+
+def test_html_metadata_nature() -> None:
+
+ with open("tests/files/nature_article.html", "r") as f:
+ nature_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
+ assert meta is not None
+ assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
+ assert meta.doi == "10.1038/d41586-020-02610-z"
+ assert meta.contrib_names == [
+ "Diana Kwon",
+ ]
+ assert meta.container_name == "Nature"
+ # "2020-09-10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.publisher == "Nature Publishing Group"
+    # note: some error in Dublin Core metadata in nature HTML resulting in duplication
+ assert (
+ meta.abstract
+ == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ )
+
+
+def test_html_metadata_ojs3() -> None:
+
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
+ ojs3_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
+ assert meta is not None
+ assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
+ assert meta.doi == "10.5210/fm.v25i10.10274"
+ assert meta.contrib_names == [
+ "Calvin Liang",
+ "Jevan Alexander Hutson",
+ "Os Keyes",
+ ]
+ assert meta.container_name == "First Monday"
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_issn == "1396-0466"
+ # "2020/09/10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.lang == "en"
+ assert (
+ meta.abstract
+ == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ )
+ assert (
+ meta.html_fulltext_url
+ == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ )
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_dlib() -> None:
+
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
+ dlib_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
+ assert meta is not None
+ assert meta.doi == "10.1045/may2017-vanhyning"
+ # "2017-05-15"
+ assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
+
+def test_html_metadata_dc_case() -> None:
+ """
+ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
+ """
+
+ snippet = """
+ <html>
+ <head>
+ <meta name="DC.Citation.Issue" content="123"/>
+ </head>
+ <body>Hi.</body>
+ </html>"""
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(snippet))
+ assert meta is not None
+ assert meta.issue == "123"
+
+
+@pytest.fixture
+def adblock() -> Any:
+ return load_adblock_rules()
+
+
+def test_html_resources(adblock) -> None:
+
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
+ dlib_html = f.read()
+
+ resources = html_extract_resources(
+ "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
+ HTMLParser(dlib_html),
+ adblock,
+ )
+
+ assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources
+
+    # check that adblock is working
+ for r in resources:
+ assert "/ga.js" not in r["url"]
+
+ with open("tests/files/plos_one_article.html", "r") as f:
+ plos_html = f.read()
+
+ resources = html_extract_resources(
+ "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+ HTMLParser(plos_html),
+ adblock,
+ )
+
+    # check that custom adblock is working
+ for r in resources:
+ assert "crossmark-cdn.crossref.org" not in r["url"]
+
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
+ monday_html = f.read()
+
+ resources = html_extract_resources(
+ "https://firstmonday.org/blah/",
+ HTMLParser(monday_html),
+ adblock,
+ )
+
+ with open("tests/files/elife_article.html", "r") as f:
+ elife_html = f.read()
+
+ resources = html_extract_resources(
+ "https://elife.org/blah/",
+ HTMLParser(elife_html),
+ adblock,
+ )
+
+ with open("tests/files/nature_article.html", "r") as f:
+ nature_html = f.read()
+
+ resources = html_extract_resources(
+ "https://nature.com/blah/",
+ HTMLParser(nature_html),
+ adblock,
+ )
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 33de35d..e14a452 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,12 @@
-
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
@@ -21,6 +21,7 @@ def ingest_worker(wayback_client, spn_client):
)
return worker
+
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
grobid_client = GrobidClient(
@@ -41,128 +42,223 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client):
@responses.activate
def test_ingest_success(ingest_worker_pdf):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
- responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
status=200,
- body=json.dumps([]))
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
- assert resp['status'] == "success"
- assert resp['request'] == request
- assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
- assert type(resp['terminal']['terminal_dt']) == str
- assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
- assert resp['terminal']['terminal_status_code']
- assert type(resp['file_meta']['size_bytes']) == int
- assert resp['file_meta']['mimetype'] == "application/pdf"
- assert resp['cdx']['url'] == TARGET + "/redirect"
- assert 'warc_path' not in resp['cdx']
- assert 'revisit_cdx' not in resp
- assert resp['grobid']['status'] == "success"
- assert resp['grobid']['status_code'] == 200
- assert resp['grobid']['grobid_version']
- assert 'fatcat_release' in resp['grobid']
- assert 'grobid_version' not in resp['grobid']['metadata']
- assert 'fatcat_release' not in resp['grobid']['metadata']
- assert not 'tei_xml' in resp['grobid']
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
+
@responses.activate
def test_ingest_landing(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
# this is for second time around; don't want to fetch same landing page
# HTML again and result in a loop
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ body="<html></html>",
+ )
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
- assert resp['status'] == "no-pdf-link"
- assert resp['request'] == request
- assert 'terminal' in resp
- assert 'file_meta' not in resp
- assert 'cdx' not in resp
- assert 'revisit_cdx' not in resp
- assert 'grobid' not in resp
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
+
@responses.activate
def test_ingest_blocklist(ingest_worker):
ingest_worker.base_url_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-url-blocklist"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+
+ ingest_worker.wall_blocklist = [
+ "://test.fatcat.wiki/",
+ ]
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 429c6b0..9bd8b5f 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -1,4 +1,3 @@
-
"""
This file contains tests to run against "live" wayback services. They default
to "skip" because you need authentication, and we shouldn't hit these services
@@ -7,10 +6,9 @@ automatically in CI.
Simply uncomment lines to run.
"""
-import json
import pytest
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
@pytest.fixture
@@ -18,16 +16,19 @@ def cdx_client():
client = CdxApiClient()
return client
+
@pytest.fixture
def wayback_client():
client = WaybackClient()
return client
+
@pytest.fixture
def spn_client():
client = SavePageNowClient()
return client
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch(cdx_client):
@@ -42,12 +43,16 @@ def test_cdx_fetch(cdx_client):
assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
assert resp.warc_csize == 25338
assert resp.warc_offset == 240665973
- assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ assert (
+ resp.warc_path
+ == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ )
# bogus datetime; shouldn't match
with pytest.raises(KeyError):
resp = cdx_client.fetch(url, "12345678123456")
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_lookup_best(cdx_client):
@@ -66,24 +71,31 @@ def test_cdx_lookup_best(cdx_client):
assert resp.mimetype == "text/html"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_wayback_fetch(wayback_client):
- resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+ resp = wayback_client.fetch_petabox(
+ 25683,
+ 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
+ )
assert resp.body
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_resource_success(wayback_client):
url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch_spn2(cdx_client):
@@ -104,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client):
# https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
datetime = "20200110222410"
@@ -117,6 +129,7 @@ def test_cdx_fetch_spn2(cdx_client):
assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_ftp(wayback_client):
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
@@ -127,29 +140,30 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
assert resp.revisit_cdx
assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
# not revisit?
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_crawl_ftp(spn_client, wayback_client):
@@ -158,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit == True
- #assert resp.status == "success"
- #assert resp.terminal_url == url
- #assert resp.cdx.url == url
+ # assert resp.hit is True
+ # assert resp.status == "success"
+ # assert resp.terminal_url == url
+ # assert resp.cdx.url == url
- assert resp.hit == False
+ assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,77 +1,110 @@
-
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
+
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
+
+
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
+ assert file_meta == {
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
+ }
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) is None
+
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
- assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
- "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
-
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 2c54c85..9d75655 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,61 +1,71 @@
+import struct
+import poppler
import pytest
-import struct
-import responses
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
-from test_wayback import wayback_client, cdx_client
-
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
def test_process_fake_pdf():
resp = process_pdf(FAKE_PDF_BYTES)
print(resp)
assert resp.status == "not-pdf"
+ with open("tests/files/dummy_zip.zip", "rb") as f:
+ pdf_bytes = f.read()
+ resp = process_pdf(pdf_bytes)
+ assert resp.status == "not-pdf"
+
+
+@pytest.mark.skipif(
+ poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
def test_process_dummy_pdf():
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'success'
+ assert resp.status == "success"
assert resp.page0_thumbnail is not None
assert len(resp.text) > 10
assert resp.meta_xml is None
- assert resp.file_meta['mimetype'] == 'application/pdf'
+ assert resp.file_meta["mimetype"] == "application/pdf"
print(resp.pdf_info)
print(resp.pdf_extra)
- assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+ assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
# 595 x 842
- assert resp.pdf_extra['page0_height'] == 842
- assert resp.pdf_extra['page0_width'] == 595
- assert resp.pdf_extra['page_count'] == 1
+ assert resp.pdf_extra["page0_height"] == 842
+ assert resp.pdf_extra["page0_width"] == 595
+ assert resp.pdf_extra["page_count"] == 1
-def test_pdfextract_worker_cdx(wayback_client):
+
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
def test_pdfextract_blob_worker():
sink = BlackholeSink()
worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
worker.process(pdf_bytes)
-
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 52f26c0..ed17d24 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,4 @@
-
-import pytest
-
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
@@ -9,20 +6,24 @@ def test_cdx_line_pusher():
sink = BlackholeSink()
# vanilla (only default filters)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(sink, cdx_file)
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['pushed'] == 19
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["pushed"] == 19
# HTTP 200 and application/pdf
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ sink,
+ cdx_file,
+ filter_mimetypes=["application/pdf"],
+ filter_http_statuses=[200, 226],
+ )
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['skip-http_status'] == 10
- assert counts['skip-mimetype'] == 2
- assert counts['pushed'] == 7
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["skip-http_status"] == 10
+ assert counts["skip-mimetype"] == 2
+ assert counts["pushed"] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 63dd887..add2c60 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,11 +1,10 @@
-
import json
+
import pytest
import responses
-
-from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
from test_wayback import *
+from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
@@ -16,7 +15,7 @@ PENDING_BODY = {
"https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
"https://cdn.onesignal.com/sdks/OneSignalSDK.js",
- ]
+ ],
}
SUCCESS_BODY = {
"status": "success",
@@ -58,12 +57,12 @@ SUCCESS_BODY = {
"https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
"https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
"https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
- "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
],
- "outlinks":{
+ "outlinks": {
"https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
- "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
- }
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695",
+ },
}
ERROR_BODY = {
"status": "error",
@@ -71,13 +70,38 @@ ERROR_BODY = {
"status_ext": "error:invalid-host-resolution",
"job_id": JOB_ID,
"message": "Couldn't resolve host for http://example5123.com.",
- "resources": []
+ "resources": [],
}
CDX_SPN_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180326070330",
+ TARGET + "/redirect",
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz",
+ ],
]
+
@pytest.fixture
def spn_client():
client = SavePageNowClient(
@@ -88,112 +112,216 @@ def spn_client():
client.poll_seconds = 0.0
return client
+
@responses.activate
def test_savepagenow_success(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
+ body=json.dumps(SUCCESS_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 4
+ assert len(responses.calls) == 5
- assert resp.success == True
+ assert resp.success is True
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
- assert resp.terminal_dt == SUCCESS_BODY['timestamp']
- assert resp.resources == SUCCESS_BODY['resources']
+ assert resp.terminal_dt == SUCCESS_BODY["timestamp"]
+ assert resp.resources == SUCCESS_BODY["resources"]
+
@responses.activate
def test_savepagenow_remote_error(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 3
+ assert len(responses.calls) == 4
- assert resp.success == False
- assert resp.status == ERROR_BODY['status_ext']
+ assert resp.success is False
+ assert resp.status == ERROR_BODY["status_ext"]
assert resp.request_url == TARGET
- assert resp.terminal_url == None
- assert resp.terminal_dt == None
- assert resp.resources == None
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
+
@responses.activate
def test_savepagenow_500(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=500,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
with pytest.raises(SavePageNowError):
- resp = spn_client.save_url_now_v2(TARGET)
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_savepagenow_no_slots(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 0,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+
+ with pytest.raises(SavePageNowBackoffError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 1
- assert len(responses.calls) == 2
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
- print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
+ print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
- assert len(responses.calls) == 5
+ assert len(responses.calls) == 6
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.body == WARC_BODY
assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
@@ -201,4 +329,3 @@ def test_crawl_resource(spn_client, wayback_client):
assert type(resp.cdx) == CdxPartial
with pytest.raises(AttributeError):
print(resp.cdx.warc_path)
-
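The reworked SavePageNow tests above encode a new pre-flight step: before submitting a capture, the client queries /save/status/user and, when no capture slots are available, raises SavePageNowBackoffError without POSTing to /save (which is why every other test now expects one additional responses.call). A minimal sketch of that slot check, using hypothetical helper and parameter names rather than the actual sandcrawler implementation:

    import requests
    from sandcrawler import SavePageNowBackoffError

    def check_spn_slots(session: requests.Session, v2_endpoint: str) -> None:
        """Pre-flight slot check implied by test_savepagenow_no_slots (hypothetical helper)."""
        resp = session.get(v2_endpoint + "/save/status/user")
        resp.raise_for_status()
        if resp.json().get("available", 0) <= 0:
            # Matches the no-slots test: back off instead of submitting a capture.
            raise SavePageNowBackoffError("no SPNv2 capture slots available")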
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6bc1ca4..da4dfd8 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,36 +1,156 @@
-
import json
+
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
-
+from sandcrawler import CdxApiClient, WaybackClient
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner, but not right mimetype
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner and mimetype, but wrong status code
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # "best"
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # older
- ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # older
+ [
+ "wiki,fatcat)/",
+ "20180712220054",
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
+
@pytest.fixture
def cdx_client():
client = CdxApiClient(
@@ -39,13 +159,13 @@ def cdx_client():
)
return client
+
@responses.activate
def test_cdx_fetch(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -58,16 +178,16 @@ def test_cdx_fetch(cdx_client):
assert resp.warc_offset == 108062304
assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
@responses.activate
def test_cdx_fetch_errors(cdx_client):
with pytest.raises(ValueError):
resp = cdx_client.fetch(CDX_TARGET, "2019")
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -77,14 +197,15 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+ assert resp
+
@responses.activate
def test_cdx_lookup_best(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -95,6 +216,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
WARC_TARGET = "http://fatcat.wiki/"
WARC_BODY = b"""
<html>
@@ -108,6 +230,7 @@ WARC_BODY = b"""
</html>
"""
+
@pytest.fixture
def wayback_client(cdx_client, mocker):
client = WaybackClient(
@@ -127,10 +250,11 @@ def wayback_client(cdx_client, mocker):
return client
+
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
client = WaybackClient(
@@ -150,6 +274,7 @@ def wayback_client_pdf(cdx_client, mocker):
return client
+
@responses.activate
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,14 +284,14 @@ def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
assert resp == WARC_BODY
+
@responses.activate
def test_lookup_resource_success(wayback_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = wayback_client.lookup_resource(CDX_TARGET)
- assert resp.hit == True
+ assert resp.hit is True
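The expanded CDX_MULTI_HIT fixture spells out the ordering that lookup_best is expected to apply: an HTTP 200 capture with the requested mimetype beats more recent captures with the wrong mimetype or a non-200 status, and among equally good rows the newer one wins. A ranking key consistent with that fixture, offered as an illustration only and not the actual CdxApiClient logic:

    # Hypothetical sort key matching the fixture's "best" row.
    def cdx_rank(row: dict, best_mimetype: str = "application/pdf") -> tuple:
        return (
            row["statuscode"] == "200",        # successful captures first
            row["mimetype"] == best_mimetype,  # then the requested mimetype
            row["timestamp"],                  # then the most recent capture
        )

    # best = max(rows, key=cdx_rank)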
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
new file mode 100644
index 0000000..786f863
--- /dev/null
+++ b/python/tests/test_xml.py
@@ -0,0 +1,17 @@
+import pytest
+
+from sandcrawler.xml import xml_reserialize
+
+
+def test_xml_reserialize() -> None:
+
+ with open("tests/files/scielo_article.jats.xml", "rb") as f:
+ raw_xml = f.read()
+
+ assert b'encoding="ISO-8859-1"' in raw_xml
+ raw_xml.decode("ISO-8859-1")
+ with pytest.raises(UnicodeDecodeError):
+ raw_xml.decode("utf-8")
+
+ str_xml = xml_reserialize(raw_xml)
+ assert 'encoding="UTF-8"' in str_xml
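The new test_xml.py documents the contract of sandcrawler.xml.xml_reserialize: it accepts raw bytes whose declaration says ISO-8859-1 and returns a UTF-8 string with an updated declaration. A minimal sketch with the same observable behavior (an assumption for illustration, not necessarily the actual implementation):

    import xml.etree.ElementTree as ET

    def xml_reserialize(raw: bytes) -> str:
        # ElementTree honors the encoding declared in the document itself.
        root = ET.fromstring(raw)
        return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")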
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_denylist.txt
index 5bca386..5bca386 120000
--- a/python/title_slug_blacklist.txt
+++ b/python/title_slug_denylist.txt