Diffstat (limited to 'python')
-rw-r--r--  python/.coveragerc | 1
-rw-r--r--  python/.flake8 | 21
-rw-r--r--  python/.gitignore | 12
-rw-r--r--  python/Makefile | 8
-rw-r--r--  python/Pipfile | 29
-rw-r--r--  python/Pipfile.lock | 2247
-rw-r--r--  python/README.md | 46
-rw-r--r--  python/TODO | 7
-rw-r--r--  python/example.env | 1
-rwxr-xr-x  python/grobid2json.py | 215
-rwxr-xr-x  python/grobid_tool.py | 149
-rwxr-xr-x  python/ia_pdf_match.py | 93
-rwxr-xr-x  python/ingest_file.py | 100
-rwxr-xr-x  python/ingest_tool.py | 244
-rwxr-xr-x  python/pdfextract_tool.py | 110
-rwxr-xr-x  python/pdftrio_tool.py | 110
-rwxr-xr-x  python/persist_tool.py | 234
-rw-r--r--  python/pyproject.toml | 7
-rw-r--r--  python/pytest.ini | 5
-rw-r--r--  python/sandcrawler/__init__.py | 59
-rw-r--r--  python/sandcrawler/db.py | 549
-rw-r--r--  python/sandcrawler/fileset_platforms.py | 832
-rw-r--r--  python/sandcrawler/fileset_strategies.py | 387
-rw-r--r--  python/sandcrawler/fileset_types.py | 74
-rw-r--r--  python/sandcrawler/grobid.py | 358
-rw-r--r--  python/sandcrawler/html.py | 307
-rw-r--r--  python/sandcrawler/html_metadata.py | 615
-rw-r--r--  python/sandcrawler/ia.py | 941
-rw-r--r--  python/sandcrawler/ingest.py | 754
-rw-r--r--  python/sandcrawler/ingest_file.py | 925
-rw-r--r--  python/sandcrawler/ingest_fileset.py | 516
-rw-r--r--  python/sandcrawler/ingest_html.py (renamed from python/sandcrawler/html_ingest.py) | 218
-rw-r--r--  python/sandcrawler/minio.py | 45
-rw-r--r--  python/sandcrawler/misc.py | 205
-rw-r--r--  python/sandcrawler/pdfextract.py | 219
-rw-r--r--  python/sandcrawler/pdftrio.py | 100
-rw-r--r--  python/sandcrawler/persist.py | 701
-rw-r--r--  python/sandcrawler/workers.py | 418
-rw-r--r--  python/sandcrawler/xml.py | 1
-rwxr-xr-x  python/sandcrawler_worker.py | 277
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py | 70
-rwxr-xr-x  python/scripts/archiveorg_fileset.py | 135
-rwxr-xr-x  python/scripts/cdx_collection.py | 82
-rwxr-xr-x  python/scripts/covid2ingestrequest.py | 79
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py | 94
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py | 193
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py | 182
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py | 87
-rwxr-xr-x  python/scripts/enrich_scored_matches.py | 19
-rwxr-xr-x  python/scripts/fetch_cdx_sha1hex.py | 170
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py | 111
-rwxr-xr-x  python/scripts/filter_groupworks.py | 48
-rwxr-xr-x  python/scripts/filter_scored_matches.py | 49
-rwxr-xr-x  python/scripts/grobid_affiliations.py | 37
-rwxr-xr-x  python/scripts/import_grobid_metadata.py | 69
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py | 36
-rwxr-xr-x  python/scripts/manifest_converter.py | 7
-rwxr-xr-x  python/scripts/oai2ingestrequest.py | 112
-rwxr-xr-x  python/scripts/pdf_thumbnail.py | 15
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py | 80
-rw-r--r--  python/tests/files/crossref_api_work_978-3-030-64953-1_4.json | 1
-rw-r--r--  python/tests/files/crossref_api_work_s1047951103000064.json | 1
-rw-r--r--  python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml | 66
-rw-r--r--  python/tests/files/grobid_refs_s1047951103000064.tei.xml | 499
-rw-r--r--  python/tests/files/small.json | 7
-rw-r--r--  python/tests/test_grobid.py | 199
-rw-r--r--  python/tests/test_grobid2json.py | 26
-rw-r--r--  python/tests/test_html.py | 28
-rw-r--r--  python/tests/test_html_ingest.py | 10
-rw-r--r--  python/tests/test_html_metadata.py | 106
-rw-r--r--  python/tests/test_ingest.py | 257
-rw-r--r--  python/tests/test_live_wayback.py | 54
-rw-r--r--  python/tests/test_misc.py | 99
-rw-r--r--  python/tests/test_pdfextract.py | 51
-rw-r--r--  python/tests/test_pushers.py | 33
-rw-r--r--  python/tests/test_savepagenow.py | 265
-rw-r--r--  python/tests/test_wayback.py | 195
-rw-r--r--  python/tests/test_xml.py | 5
78 files changed, 11007 insertions, 4710 deletions
diff --git a/python/.coveragerc b/python/.coveragerc
index 67053a7..51038d6 100644
--- a/python/.coveragerc
+++ b/python/.coveragerc
@@ -2,4 +2,3 @@
omit = tests/*
source =
sandcrawler
- grobid2json
diff --git a/python/.flake8 b/python/.flake8
new file mode 100644
index 0000000..c7ef5fe
--- /dev/null
+++ b/python/.flake8
@@ -0,0 +1,21 @@
+[flake8]
+select = C,E,F,W,ANN
+# ANN003 is annotation on, eg, **kwargs
+# ANN101 is annotation on 'self' (why would that be wanted?)
+# ANN204 is annotation on '__init__()'
+# ANN401 is 'Any' type
+# E265,E266 are restrictions on comments ('#')
+# E501 is line-too-long, which we enforce with black
+# W503,E203 are allowed by black
+# TODO: C901 is complexity, should be re-enabled at some point
+ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203
+per-file-ignores =
+ sandcrawler/__init__.py: F401
+ sandcrawler/ia.py: E402
+ tests/*.py: ANN201,ANN001,F403,F405
+ # TODO: add more annotations to CLI scripts
+ *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405
+ scripts: ANN201,ANN001,ANN202,ANN206,ANN205
+exclude = .git,__pycache__,.venv,scripts/
+max-line-length = 96
+max-complexity = 30
diff --git a/python/.gitignore b/python/.gitignore
index 9b0fdb2..a5a773e 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,4 +1,14 @@
*part-000*
*.tar.gz
-*.tsv.gz
+*.gz
htmlcov/
+samples/
+*.json
+TODO*
+*.tsv
+
+!.flake8
+!.gitlab-ci.yml
+!.pylintrc
+!.coveragerc
+!.gitignore
diff --git a/python/Makefile b/python/Makefile
index 9593ab4..940a7eb 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -14,14 +14,14 @@ deps: ## Install dependencies using pipenv
.PHONY: lint
lint: ## Run lints (eg, flake8, mypy)
- #pipenv run flake8 . --exit-zero
- pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero
+ pipenv run flake8 . --exit-zero
+ pipenv run isort -q -c . || true
pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
- pipenv run pylint --rcfile=.pylintrc -E --jobs=4 sandcrawler tests *.py
.PHONY: fmt
fmt: ## Run code formating on all source code
- pipenv run black *.py sandcrawler/ tests/
+ pipenv run isort --atomic .
+ pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py
.PHONY: test
test: ## Run all tests and lints
diff --git a/python/Pipfile b/python/Pipfile
index bb3f180..b841755 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,6 +1,6 @@
[[source]]
name = "ia"
-url = "https://devpi.archive.org/wb/prod"
+url = "https://devpi.us.archive.org/wb/prod"
verify_ssl = true
[[source]]
@@ -20,38 +20,41 @@ ipython = "*"
mypy = "*"
flake8 = "*"
flake8-annotations = "*"
-
-# pytype is failing to install on xenial VMs
-#pytype = "*"
+isort = "*"
+types-requests = "*"
+types-beautifulsoup4 = "*"
+types-dateparser = "*"
+types-psycopg2 = "*"
+types-Pillow = "*"
+black = "*"
[packages]
requests = ">=2"
-raven = {extras = ['flask'],version = "*"}
confluent-kafka = "*"
python-snappy = "*"
boto3 = "*"
-minio = "*"
+minio = "<7.0.0"
psycopg2 = "*"
bs4 = "*"
python-magic = "*"
ftfy = "*"
internetarchive = "*"
-Flask = ">=1"
urlcanon = "*"
-pillow = ">=3"
+Pillow = ">=3"
python-poppler = ">=0.2.1"
selectolax = ">=0.2"
-trafilatura = "*"
+# constraining trafilatura to prevent a version conflict with
+# `charset_normalizer`, between htmldate and requests
+trafilatura = ">=1,<1.4"
+htmldate= ">=1,<1.4"
pydantic = ">=1.7"
dateparser = "*"
braveblock = "*"
dynaconf = ">=3"
sentry-sdk = { version = ">=0.14.0", extras = [] }
zstandard = "*"
-
-# must lock black to an exact version because it is still "beta"
-# see: https://github.com/psf/black/issues/517
-black = "==19.10b0"
+grobid_tei_xml = ">=0.1.2,<0.2.0"
+PyMuPDF = ">=1.19.0,<1.20.0"
[requires]
python_version = "3.8"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 3551b54..546a420 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "88f69c9f6bcc19909238c16af9cb7101959139fc1b6012af31f17900c89e3aea"
+ "sha256": "35d0f0cd2f3903cce19d5a73f50a89ba09a1b43abbda84894fd45411d7f32760"
},
"pipfile-spec": 6,
"requires": {
@@ -10,7 +10,7 @@
"sources": [
{
"name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
+ "url": "https://devpi.us.archive.org/wb/prod",
"verify_ssl": true
},
{
@@ -21,113 +21,161 @@
]
},
"default": {
- "appdirs": {
- "hashes": [
- "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
- "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"
- ],
- "version": "==1.4.4"
- },
- "attrs": {
- "hashes": [
- "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
- "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
- ],
- "version": "==20.3.0"
- },
- "backports.csv": {
- "hashes": [
- "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
- "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
- ],
- "version": "==1.0.7"
+ "async-timeout": {
+ "hashes": [
+ "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15",
+ "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.2"
+ },
+ "backports.zoneinfo": {
+ "hashes": [
+ "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
+ "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
+ "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
+ "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
+ "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
+ "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
+ "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
+ "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
+ "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
+ "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
+ "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
+ "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
+ "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
+ "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
+ "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
+ "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
+ ],
+ "markers": "python_version < '3.9' and python_version >= '3.6' and python_version < '3.9'",
+ "version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
- "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
- "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
- "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
- ],
- "version": "==4.9.3"
- },
- "black": {
- "hashes": [
- "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b",
- "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"
- ],
- "index": "ia",
- "version": "==19.10b0"
- },
- "blinker": {
- "hashes": [
- "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
+ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
+ "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
- "version": "==1.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.11.1"
},
"boto3": {
"hashes": [
- "sha256:ba8de10d3ede338d51ae47e428b97dcc1d1b507741aa98697e63e879a147f4aa",
- "sha256:e3f10ed6d9ca98415fdec15c85e50a89ec38d6229bce3fafd5e7965b16c4ebc5"
+ "sha256:7a6766c7177a9c6f85365e02aabd96ca4d72e08bc5cb127cb51b0a97ac9b9d1b",
+ "sha256:82b790b1dabd0746b028d2013b5d4d636a41f3aaf25520081f4c173cb6eb395d"
],
"index": "ia",
- "version": "==1.16.44"
+ "version": "==1.26.37"
},
"botocore": {
"hashes": [
- "sha256:4ff05bc089ba78a5996f06dcfddf8ca51583e30ce779ed95e9952e90c1907420",
- "sha256:7725e08c95ae96c4dbd955cb4ae44a0c06d3e41f76a7feb0a941c27a44c63113"
+ "sha256:18ab8e95345a6d0d2653ce65d261a0aef6fef8a57a35a89e3cea6ffe315e92fc",
+ "sha256:3afa4fec9f7713caa05116563b38f81bec7bd20585d517155484d3f25efab5aa"
],
- "version": "==1.19.44"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.29.37"
},
"braveblock": {
"hashes": [
- "sha256:02e630baa202d0294702bf0274033abe27ee1f17a40c8e2651594f533db645e3",
- "sha256:1a0a7aff2c36745f60ad2e08be050c6c9b2e5e1eeff4e6d5cf4ea02de8a5eacc",
- "sha256:1bae851fbe20f94055402fb8b854ce7ed393336808bd85c65c36d03e360a9aef",
- "sha256:2a5648640ddb063bfd0987142b678ddf078ac3f6081a91f13cc80ad3c876944f",
- "sha256:2fe6bce6ae47449724be267b4ba8d76d4c897091dda5c51c6aaab242e004118d",
- "sha256:5bad6da1d03cd32ba238055f77a45d2cdb4cf6d2bb06e9c9eb7c0c050d24dd4c",
- "sha256:5eb9eaf9f9d98e787f6473d2b6f4f3420389ded86e68352487e403b57793a7fe",
- "sha256:784be89c29ffedfbde1b4748ccb9c678d16d79c839791fc49e4947051f47ceb0",
- "sha256:a47e34ba4114c3a63b2b2f4cd6bb2a596cfbd5ecee00efb96924b39a84aa080f",
- "sha256:c9a8d4c649ececf2d3d7bcfa11a9d35d68568ac2bbaa2edb9807f8ceeafdc762",
- "sha256:cf54faf4de8bd123e64c85800fde5515b45c265ea5b260cb198233bd83b6c4d9",
- "sha256:f74cc53561827368cf6534004cd05f83e39fdd1d1714b63b9bd7919b299f970f"
+ "sha256:0bfca14473275366f2f822751c4e8dde7f94ee5ce8a9372244870452458f4fe1",
+ "sha256:107050b2e1c885b748727573a54a85d2e1ea9ad86146370f6eb79ca18b9673d4",
+ "sha256:13f9769eac9c4027eba2f400e635572796f7a7feb343f442d13c4b78e7d6f536",
+ "sha256:14efeada36418525da7c3b26393041b85242ffa1165328ec7eaf9b9780b72d62",
+ "sha256:1ab6980d10b8a02fd0dc73e28f18a0a3e17be636d314c1fdaa3bbb3e36a81f0f",
+ "sha256:45286418a43a3dfab50bdaf922f5003dbd2c3d1f696d23883568f4fa14b8093e",
+ "sha256:66c2442154102bff8df9c6f05cb72cd5cda6f4e1ed88592800ab1b6e8100e806",
+ "sha256:73de4f925ae5442d3361a71d7c0eeb1b4c540bf3d0c91100a00325ccef9e743c",
+ "sha256:80cbeeb6d083bc2a9106214188e5ce05362f248c1051344dc6673b7b38a561da",
+ "sha256:8460b10c9b82cc9d0b6056e1fe206bea209fe5a83ba87bdf9486305657224a44",
+ "sha256:903c506fc05eb6b76e4d31f957c1118078582db80f8ef5ce5ac74418f094d498",
+ "sha256:dcb773e3e275de896efebe57159a67587283d6ca1d1a36695170a3756fd2ef3a"
],
"index": "ia",
- "version": "==0.1.9"
+ "version": "==0.3.0"
},
"brotli": {
"hashes": [
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
+ "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
"sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
"sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
"sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
"sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
"sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
+ "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
+ "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
+ "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
"sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
"sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
"sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
+ "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
+ "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
"sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
"sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
"sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
"sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
"sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
"sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
"sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
"sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
"sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
"sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
+ "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
"sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
+ "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
+ "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
"sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
"sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
+ "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
"sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
"sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
+ "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
"sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
+ "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
"sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
"sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
"sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
+ "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
"sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
"sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
+ "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
+ "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
+ "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
+ "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
+ "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
"sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
],
"version": "==1.0.9"
@@ -141,98 +189,141 @@
},
"certifi": {
"hashes": [
- "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
- "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.12.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"chardet": {
"hashes": [
- "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
- "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5",
+ "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"
],
- "version": "==4.0.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.1.0"
},
- "click": {
+ "charset-normalizer": {
"hashes": [
- "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
- "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==7.1.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
"configparser": {
"hashes": [
- "sha256:005c3b102c96f4be9b8f40dafbd4997db003d07d1caa19f37808be8031475f2a",
- "sha256:08e8a59ef1817ac4ed810bb8e17d049566dd6e024e7566f6285c756db2bb4ff8"
+ "sha256:8be267824b541c09b08db124917f48ab525a6c3e837011f3130781a224c57090",
+ "sha256:b065779fd93c6bf4cee42202fa4351b4bb842e96a3fb469440e484517a49b9fa"
],
- "version": "==5.0.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.3.0"
},
"confluent-kafka": {
"hashes": [
- "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0",
- "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3",
- "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2",
- "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696",
- "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3",
- "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8",
- "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080",
- "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221",
- "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66",
- "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003",
- "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe",
- "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4",
- "sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9",
- "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120",
- "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410",
- "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c",
- "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35",
- "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2",
- "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf",
- "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38",
- "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44",
- "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53",
- "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592",
- "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4",
- "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e",
- "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b"
+ "sha256:24872e3e427b16f77461ae7e6cf48f9c5c03c06884ac51bad179580a4dd29145",
+ "sha256:2fb97bd25d436bd59fe079885aa77a3a2f23cface9c6359d4700053665849262",
+ "sha256:3207c76d1510571cbda85560c293dec5f8d6645103b3f471abab5c83e51a7ccd",
+ "sha256:344a7fec57d3348002392a7bd5cf66fb9dbe4a103e81636037cccd6fff944e28",
+ "sha256:382739e499deaf488459c2307ebcc0e9b3653340801d6053c207c84ad710ee8d",
+ "sha256:4d6bfcc352cd608fcf325037b4425c0edaeae0c6a5439423a865110b59f897e9",
+ "sha256:4f27ddf7daf630a95e1d7dfddd0c8cf8a7755c9567dc9851bf2e74c22f34af42",
+ "sha256:5b24587b30a4d288a7b1c5cc756ee707fc1293fa28454f8db40267ed9d7e73c8",
+ "sha256:6ab745babc33a864e3ca3a2659c005ed52503e39936fff5812eeb21920009c8b",
+ "sha256:7e6592533b3f8cfbc086ea2d472058f10e5f6a04a388edca01773285c63284b4",
+ "sha256:b9ad6ad9d58c2735129f94f044b2236d7de87d77a101c8c630363add12d62a4a",
+ "sha256:be7b37020f614017d8a047565b3fb61ceef9c30a9ee093f9373d06a4c32068ae",
+ "sha256:bef263b6d78a3e63399e1b82cc07cbb30af762884df96a369cba0e1011405344",
+ "sha256:c4b7c4d0b647952d2b506948131d6e7e1c42ccb16aac8e3e52369c16b94e7215",
+ "sha256:d036bf5e1d7cb3743125d7caf62b1a23b12e403de240144b6117ddbb8f815a33",
+ "sha256:d0cbf8e7510497afd651e134bccb9d579aa90234e45734046fcb6b752d2ee312",
+ "sha256:d533ea0e527122f177943ee35eb356b8d9f7af35fe357e0cdc0514d95804aaea",
+ "sha256:e41b9313c44f54a3cd29b0e95fa32a8e685edaa9287b338f59530b21ebc0b453",
+ "sha256:e9107767cc9240cbf9b5c0fdded5eeead86a1690d1c15de6cbbdcc9d7e3b1962",
+ "sha256:f96033c335da26ea1716ab9adfce459c211b023ca09528f958fb28bf099fc0df",
+ "sha256:f970a2c6d22c934ea68d645abcc96056ecb107489f28a38b2171f65655b7e41f",
+ "sha256:fe31b3b6930d67380df371f5088950f93da5fac580cde3bedb35f992b2498e1b",
+ "sha256:ff08b9f978f8b37f2961614a68f9fdb4fabd10cdd940234e80200806d93a1c30",
+ "sha256:ff4d1557b7fb72e752c36205a344863b8f4f23b3a834780fc36eb7ebde614de7"
],
"index": "ia",
- "version": "==1.5.0"
+ "version": "==1.9.2"
},
"contextlib2": {
"hashes": [
- "sha256:01f490098c18b19d2bd5bb5dc445b2054d2fa97f09a4280ba2c5f3c394c8162e",
- "sha256:3355078a159fbb44ee60ea80abd0d87b80b78c248643b49aa6d94673b413609b"
+ "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f",
+ "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"
],
- "version": "==0.6.0.post1"
+ "markers": "python_version >= '3.6'",
+ "version": "==21.6.0"
},
"courlan": {
"hashes": [
- "sha256:16b22e6b98838469793ce6c4b9501d7a7eff679c227a4d3c135349d1da12f623",
- "sha256:649756066671c1fdcbef129766300aa1b1c5b2cf5bcdedcb0aadcd7f09cd5e6b"
+ "sha256:d06c5b048b2b5cd5c0ac77304dc24b795e4bb257a7b6077ea405a3b5e99ae179",
+ "sha256:d141d30f8e52d344cf9904aa29e4d8750e934026bdbca2dc7bd58b750566d058"
],
- "version": "==0.2.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
"crawllib": {
"hashes": [
- "sha256:a3ad99463da04a69a6429e994d425c0144bdda473fbba8743127a3fc2811abea"
+ "sha256:9a30a10318dc706f1e27ff0af950ac14a77f73c18d329771f44d872fd63630e3"
],
- "version": "==0.1.4.8"
+ "version": "==0.1.6"
},
- "cssselect": {
- "hashes": [
- "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
- "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
- ],
- "version": "==1.1.0"
+ "cython": {
+ "hashes": [
+ "sha256:061e25151c38f2361bc790d3bcf7f9d9828a0b6a4d5afa56fbed3bd33fb2373a",
+ "sha256:06be83490c906b6429b4389e13487a26254ccaad2eef6f3d4ee21d8d3a4aaa2b",
+ "sha256:07d173d3289415bb496e72cb0ddd609961be08fe2968c39094d5712ffb78672b",
+ "sha256:0bbc27abdf6aebfa1bce34cd92bd403070356f28b0ecb3198ff8a182791d58b9",
+ "sha256:0ea8267fc373a2c5064ad77d8ff7bf0ea8b88f7407098ff51829381f8ec1d5d9",
+ "sha256:3875c2b2ea752816a4d7ae59d45bb546e7c4c79093c83e3ba7f4d9051dd02928",
+ "sha256:39afb4679b8c6bf7ccb15b24025568f4f9b4d7f9bf3cbd981021f542acecd75b",
+ "sha256:3f85eb2343d20d91a4ea9cf14e5748092b376a64b7e07fc224e85b2753e9070b",
+ "sha256:40eff7aa26e91cf108fd740ffd4daf49f39b2fdffadabc7292b4b7dc5df879f0",
+ "sha256:479690d2892ca56d34812fe6ab8f58e4b2e0129140f3d94518f15993c40553da",
+ "sha256:4a4b03ab483271f69221c3210f7cde0dcc456749ecf8243b95bc7a701e5677e0",
+ "sha256:513e9707407608ac0d306c8b09d55a28be23ea4152cbd356ceaec0f32ef08d65",
+ "sha256:5514f3b4122cb22317122a48e175a7194e18e1803ca555c4c959d7dfe68eaf98",
+ "sha256:5ba622326f2862f9c1f99ca8d47ade49871241920a352c917e16861e25b0e5c3",
+ "sha256:63b79d9e1f7c4d1f498ab1322156a0d7dc1b6004bf981a8abda3f66800e140cd",
+ "sha256:656dc5ff1d269de4d11ee8542f2ffd15ab466c447c1f10e5b8aba6f561967276",
+ "sha256:67fdd2f652f8d4840042e2d2d91e15636ba2bcdcd92e7e5ffbc68e6ef633a754",
+ "sha256:79e3bab19cf1b021b613567c22eb18b76c0c547b9bc3903881a07bfd9e7e64cf",
+ "sha256:856d2fec682b3f31583719cb6925c6cdbb9aa30f03122bcc45c65c8b6f515754",
+ "sha256:8669cadeb26d9a58a5e6b8ce34d2c8986cc3b5c0bfa77eda6ceb471596cb2ec3",
+ "sha256:8733cf4758b79304f2a4e39ebfac5e92341bce47bcceb26c1254398b2f8c1af7",
+ "sha256:97335b2cd4acebf30d14e2855d882de83ad838491a09be2011745579ac975833",
+ "sha256:afbce249133a830f121b917f8c9404a44f2950e0e4f5d1e68f043da4c2e9f457",
+ "sha256:b0595aee62809ba353cebc5c7978e0e443760c3e882e2c7672c73ffe46383673",
+ "sha256:b6da3063c5c476f5311fd76854abae6c315f1513ef7d7904deed2e774623bbb9",
+ "sha256:c8e8025f496b5acb6ba95da2fb3e9dacffc97d9a92711aacfdd42f9c5927e094",
+ "sha256:cddc47ec746a08603037731f5d10aebf770ced08666100bd2cdcaf06a85d4d1b",
+ "sha256:cdf10af3e2e3279dc09fdc5f95deaa624850a53913f30350ceee824dc14fc1a6",
+ "sha256:d968ffc403d92addf20b68924d95428d523436adfd25cf505d427ed7ba3bee8b",
+ "sha256:dbee03b8d42dca924e6aa057b836a064c769ddfd2a4c2919e65da2c8a362d528",
+ "sha256:e1958e0227a4a6a2c06fd6e35b7469de50adf174102454db397cec6e1403cce3",
+ "sha256:e6ffa08aa1c111a1ebcbd1cf4afaaec120bc0bbdec3f2545f8bb7d3e8e77a1cd",
+ "sha256:e83228e0994497900af954adcac27f64c9a57cd70a9ec768ab0cb2c01fd15cf1",
+ "sha256:ea1dcc07bfb37367b639415333cfbfe4a93c3be340edf1db10964bc27d42ed64",
+ "sha256:eca3065a1279456e81c615211d025ea11bfe4e19f0c5650b859868ca04b3fcbd",
+ "sha256:ed087eeb88a8cf96c60fb76c5c3b5fb87188adee5e179f89ec9ad9a43c0c54b3",
+ "sha256:eeb475eb6f0ccf6c039035eb4f0f928eb53ead88777e0a760eccb140ad90930b",
+ "sha256:eefd2b9a5f38ded8d859fe96cc28d7d06e098dc3f677e7adbafda4dcdd4a461c",
+ "sha256:f3fd44cc362eee8ae569025f070d56208908916794b6ab21e139cea56470a2b3",
+ "sha256:f9944013588a3543fca795fffb0a070a31a243aa4f2d212f118aa95e69485831"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.29.32"
},
"dateparser": {
"hashes": [
- "sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a",
- "sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8"
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
"index": "ia",
- "version": "==1.0.0"
+ "version": "==1.1.4"
},
"dawg": {
"hashes": [
@@ -251,10 +342,11 @@
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==4.4.2"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
},
"docopt": {
"hashes": [
@@ -270,54 +362,58 @@
},
"dynaconf": {
"hashes": [
- "sha256:808adfe964f10695846dbf8dad7632e47fc3bc38860fd1887ed57dddffc4eff2",
- "sha256:9b34ab2f811a81755f5eb4beac77a69e1e0887528c7e37fc4bc83fed52dcf502"
+ "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165",
+ "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"
],
"index": "ia",
- "version": "==3.1.2"
+ "version": "==3.1.11"
},
"elasticsearch": {
"hashes": [
- "sha256:4ebd34fd223b31c99d9f3b6b6236d3ac18b3046191a37231e8235b06ae7db955",
- "sha256:a725dd923d349ca0652cf95d6ce23d952e2153740cf4ab6daf4a2d804feeed48"
+ "sha256:840adeb45a5ec9102a83f3cf481aae83a3775b75d6dd83a7310b04e44a5d0308",
+ "sha256:f511ea92e96db09b0e96b0de5fbbb7aa5c3740b0c571a364a2c3a1cc7ec06203"
],
- "version": "==7.10.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'",
+ "version": "==7.17.8"
},
"filelock": {
"hashes": [
- "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59",
- "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
- "version": "==3.0.12"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.8.2"
},
- "flask": {
+ "ftfy": {
"hashes": [
- "sha256:4efa1ae2d7c9865af48986de8aeb8504bf32c7f3d6fdc9353d34b21f4b127060",
- "sha256:8a4fdd8936eba2512e9c85df320a37e694c93945b33ef33c89946a340a238557"
+ "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca",
+ "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"
],
"index": "ia",
- "version": "==1.1.2"
+ "version": "==6.1.1"
},
- "ftfy": {
+ "globalwayback": {
"hashes": [
- "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720"
+ "sha256:683f19dee720ef11335952aa33615e50c945196c82e18a5d8150635f92022d23"
],
"index": "ia",
- "version": "==5.8"
+ "version": "==0.8.12.6"
},
- "globalwayback": {
+ "grobid-tei-xml": {
"hashes": [
- "sha256:579822d0521ceb0427b9e617ff5355ff5742c9927d571d11711954f761d7e81a"
+ "sha256:022fdf54dbd067b520c1effe3c1a1f2ac248492ea310627e9462757748cb461b",
+ "sha256:35c9afb14f6f76100dce5f5815e67ec9fa4122e2f268394e0baf6eafbd8668d8"
],
"index": "ia",
- "version": "==0.6.8"
+ "version": "==0.1.3"
},
"htmldate": {
"hashes": [
- "sha256:03f4e9648bf5bade11ecdb2a825a06019fafbfdcafd88151a4ce0407325f43c7",
- "sha256:2e383fdbac3fb8a3cc6307502d7b920bb10f938113a1d108ec315aa195a2bc28"
+ "sha256:603b86eaf0f076efcd653d57fe0470305f751417711f4e373279235d0ff587e6",
+ "sha256:83830715faf0f22272d9e24e571a4955308a008107d0ca9359c0de77b99766cd"
],
- "version": "==0.7.2"
+ "index": "ia",
+ "version": "==1.3.2"
},
"ialib": {
"hashes": [
@@ -334,149 +430,201 @@
},
"internetarchive": {
"hashes": [
- "sha256:0e9b24577086283280a5089b3e65086640b4e42d61ca4af913639f45b02b9e4c",
- "sha256:bf28ab57939a80a61c2cf66bb7173ea1989013494dab564c99035574d5b4faea"
+ "sha256:de856465c2ef6852184d08bfd59c0ca01904865b373a27b383034ac6b4128eb6"
],
"index": "ia",
- "version": "==1.9.6"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
- "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
- ],
- "version": "==1.1.0"
+ "version": "==3.0.2"
},
"jinja2": {
"hashes": [
- "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
- "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
+ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
+ "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "version": "==2.11.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.1.2"
},
"jmespath": {
"hashes": [
- "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
- "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
+ "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+ "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
- "version": "==0.10.0"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.0.1"
},
"jsonpatch": {
"hashes": [
- "sha256:da3831be60919e8c98564acfc1fa918cb96e7c9750b0428388483f04d0d1c5a7",
- "sha256:e930adc932e4d36087dbbf0f22e1ded32185dfb20662f2e3dd848677a5295a14"
+ "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397",
+ "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"
],
- "version": "==1.28"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.32"
},
"jsonpointer": {
"hashes": [
- "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
- "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
+ "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
+ "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
],
- "version": "==2.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.3"
},
"justext": {
"hashes": [
- "sha256:330035dfaaa960465276afa1836dfb6e63791011a8dfc6da2757142cc4d14d54",
- "sha256:4b8b7f0749e8725f0089ebe0239c1a45286d61bf507b3f05d136c2700dea4aa6"
+ "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf",
+ "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"
],
- "version": "==2.2.0"
+ "version": "==3.0.0"
},
- "lxml": {
+ "langcodes": {
"hashes": [
- "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
- "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
- "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
- "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
- "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
- "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
- "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
- "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
- "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
- "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
- "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
- "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
- "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
- "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
- "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
- "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
- "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
- "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
- "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
- "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
- "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
- "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
- "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
- "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
- "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
- "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
- "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
- "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
- "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
- "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
- "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
- "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
- "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
- "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
- "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
- "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
- "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
+ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
+ "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
],
- "markers": "python_version >= '3.5'",
- "version": "==4.6.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.3.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
- "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
- "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
- "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
- "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
- "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
- "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
- "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
- "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
- "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
- "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
- "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
- "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
- "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
- "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
- "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
- "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
- "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
- "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
- "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
- "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
- "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
- "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
- "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
- "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
- "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
- "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
- "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
- "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
- "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
- "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
- "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
- ],
- "version": "==1.1.1"
+ "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
+ "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
+ "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
+ "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
+ "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
+ "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
+ "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
+ "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
+ "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
+ "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
+ "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
+ "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
+ "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
+ "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
+ "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
+ "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
+ "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
+ "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
+ "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
+ "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
+ "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
+ "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
+ "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
+ "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
+ "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
+ "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
+ "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
+ "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
+ "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
+ "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
+ "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
+ "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
+ "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
+ "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
+ "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
+ "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
+ "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
+ "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
+ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
+ "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.1"
},
"minio": {
"hashes": [
- "sha256:bae5d060796ba3eb8dbdd2bbb7dc83da58ec4aca284689489467cacdc4a8b112",
- "sha256:f96d7f461ff88d7df2932eecf55f955ed93874c7ed84bc9454b2d7955de768fe"
+ "sha256:7cb075b56bac894551304cb824f958069a84e0dd2d0a685f9bed3c05e15727bf",
+ "sha256:acae9bfae0aec1b92025bd63e18135ebb4994c84600716c5323e14cb0c9a0b03",
+ "sha256:eec4ab073ff979c34e928e532d8acc1d40d61ba4404709cf27ab3ecdcfa2a561"
],
"index": "ia",
- "version": "==7.0.0"
+ "version": "==6.0.2"
},
- "pathspec": {
+ "perfstat": {
"hashes": [
- "sha256:86379d6b86d75816baba717e64b1a3a3469deb93bb76d613c9ce79edc5cb68fd",
- "sha256:aa0cb481c4041bf52ffa7b0d8fa6cd3e88a2ca4879c533c9153882ee2556790d"
+ "sha256:4f91fab9be6076972c66fe818eed488be28f1044009237adccce42ff2c7861f5"
],
- "version": "==0.8.1"
+ "version": "==0.1.0.1"
},
"pillow": {
"hashes": [
@@ -535,29 +683,22 @@
},
"psycopg2": {
"hashes": [
- "sha256:00195b5f6832dbf2876b8bf77f12bdce648224c89c880719c745b90515233301",
- "sha256:068115e13c70dc5982dfc00c5d70437fe37c014c808acce119b5448361c03725",
- "sha256:17d50b4966818e09e7221f2d64667a7a2fbb43cff6210d6fb6236a16fe8fc622",
- "sha256:26e7fd115a6db75267b325de0fba089b911a4a12ebd3d0b5e7acb7028bc46821",
- "sha256:2c93d4d16933fea5bbacbe1aaf8fa8c1348740b2e50b3735d1b0bf8154cbf0f3",
- "sha256:56007a226b8e95aa980ada7abdea6b40b75ce62a433bd27cec7a8178d57f4051",
- "sha256:56fee7f818d032f802b8eed81ef0c1232b8b42390df189cab9cfa87573fe52c5",
- "sha256:6a3d9efb6f36f1fe6aa8dbb5af55e067db802502c55a9defa47c5a1dad41df84",
- "sha256:6c237e85e534045ea8e9a49ba57fe1c362b564aaac4940083fef7e74c6bf64cc",
- "sha256:7a02112996b0dd47a5a2b13c67b301284ebcc68ce7f4881d1f97f8598fe6f1f5",
- "sha256:7e82d44fc5327d0e6b0f7428bc572a645e6cfa8647860ce1da0d262e548ad921",
- "sha256:a49833abfdede8985ba3f3ec641f771cca215479f41523e99dace96d5b8cce2a",
- "sha256:ad2fe8a37be669082e61fb001c185ffb58867fdbb3e7a6b0b0d2ffe232353a3e",
- "sha256:b8cae8b2f022efa1f011cc753adb9cbadfa5a184431d09b273fb49b4167561ad",
- "sha256:d160744652e81c80627a909a0e808f3c6653a40af435744de037e3172cf277f5",
- "sha256:d5062ae50b222da28253059880a871dc87e099c25cb68acf613d9d227413d6f7",
- "sha256:f155cf65726e4afc2316028fcc5791a1bf384cf2c96562b8b97f18c1fb64f272",
- "sha256:f22ea9b67aea4f4a1718300908a2fb62b3e4276cf00bd829a97ab5894af42ea3",
- "sha256:f974c96fca34ae9e4f49839ba6b78addf0346777b46c4da27a7bf54f48d3057d",
- "sha256:fb23f6c71107c37fd667cb4ea363ddeb936b348bbd6449278eb92c189699f543"
+ "sha256:093e3894d2d3c592ab0945d9eba9d139c139664dcf83a1c440b8a7aa9bb21955",
+ "sha256:190d51e8c1b25a47484e52a79638a8182451d6f6dff99f26ad9bd81e5359a0fa",
+ "sha256:1a5c7d7d577e0eabfcf15eb87d1e19314c8c4f0e722a301f98e0e3a65e238b4e",
+ "sha256:1e5a38aa85bd660c53947bd28aeaafb6a97d70423606f1ccb044a03a1203fe4a",
+ "sha256:322fd5fca0b1113677089d4ebd5222c964b1760e361f151cbb2706c4912112c5",
+ "sha256:4cb9936316d88bfab614666eb9e32995e794ed0f8f6b3b718666c22819c1d7ee",
+ "sha256:920bf418000dd17669d2904472efeab2b20546efd0548139618f8fa305d1d7ad",
+ "sha256:922cc5f0b98a5f2b1ff481f5551b95cd04580fd6f0c72d9b22e6c0145a4840e0",
+ "sha256:a5246d2e683a972e2187a8714b5c2cf8156c064629f9a9b1a873c1730d9e245a",
+ "sha256:b9ac1b0d8ecc49e05e4e182694f418d27f3aedcfca854ebd6c05bb1cffa10d6d",
+ "sha256:d3ef67e630b0de0779c42912fe2cbae3805ebaba30cda27fea2a3de650a9414f",
+ "sha256:f5b6320dbc3cf6cfb9f25308286f9f7ab464e65cfb105b64cc9c52831748ced2",
+ "sha256:fc04dd5189b90d825509caa510f20d1d504761e78b8dfb95a0ede180f71d50e5"
],
"index": "ia",
- "version": "==2.8.6"
+ "version": "==2.9.5"
},
"publicsuffix": {
"hashes": [
@@ -567,86 +708,171 @@
},
"pydantic": {
"hashes": [
- "sha256:025bf13ce27990acc059d0c5be46f416fc9b293f45363b3d19855165fee1874f",
- "sha256:185e18134bec5ef43351149fe34fda4758e53d05bb8ea4d5928f0720997b79ef",
- "sha256:213125b7e9e64713d16d988d10997dabc6a1f73f3991e1ff8e35ebb1409c7dc9",
- "sha256:24ca47365be2a5a3cc3f4a26dcc755bcdc9f0036f55dcedbd55663662ba145ec",
- "sha256:38be427ea01a78206bcaf9a56f835784afcba9e5b88fbdce33bbbfbcd7841229",
- "sha256:475f2fa134cf272d6631072554f845d0630907fce053926ff634cc6bc45bf1af",
- "sha256:514b473d264671a5c672dfb28bdfe1bf1afd390f6b206aa2ec9fed7fc592c48e",
- "sha256:59e45f3b694b05a69032a0d603c32d453a23f0de80844fb14d55ab0c6c78ff2f",
- "sha256:5b24e8a572e4b4c18f614004dda8c9f2c07328cb5b6e314d6e1bbd536cb1a6c1",
- "sha256:6e3874aa7e8babd37b40c4504e3a94cc2023696ced5a0500949f3347664ff8e2",
- "sha256:8d72e814c7821125b16f1553124d12faba88e85405b0864328899aceaad7282b",
- "sha256:a4143c8d0c456a093387b96e0f5ee941a950992904d88bc816b4f0e72c9a0009",
- "sha256:b2b054d095b6431cdda2f852a6d2f0fdec77686b305c57961b4c5dd6d863bf3c",
- "sha256:c59ea046aea25be14dc22d69c97bee629e6d48d2b2ecb724d7fe8806bf5f61cd",
- "sha256:d1fe3f0df8ac0f3a9792666c69a7cd70530f329036426d06b4f899c025aca74e",
- "sha256:d8df4b9090b595511906fa48deda47af04e7d092318bfb291f4d45dfb6bb2127",
- "sha256:dba5c1f0a3aeea5083e75db9660935da90216f8a81b6d68e67f54e135ed5eb23",
- "sha256:e682f6442ebe4e50cb5e1cfde7dda6766fb586631c3e5569f6aa1951fd1a76ef",
- "sha256:ecb54491f98544c12c66ff3d15e701612fc388161fd455242447083350904730",
- "sha256:f5b06f5099e163295b8ff5b1b71132ecf5866cc6e7f586d78d7d3fd6e8084608",
- "sha256:f6864844b039805add62ebe8a8c676286340ba0c6d043ae5dea24114b82a319e",
- "sha256:ffd180ebd5dd2a9ac0da4e8b995c9c99e7c74c31f985ba090ee01d681b1c4b95"
+ "sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
+ "sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
+ "sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
+ "sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
+ "sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
+ "sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
+ "sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
+ "sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
+ "sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
+ "sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
+ "sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
+ "sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
+ "sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
+ "sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
+ "sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
+ "sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
+ "sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
+ "sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
+ "sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
+ "sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
+ "sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
+ "sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
+ "sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
+ "sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
+ "sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
+ "sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
+ "sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
+ "sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
+ "sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
+ "sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
+ "sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
+ "sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
+ "sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
+ "sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
+ "sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
+ "sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
],
"index": "ia",
- "version": "==1.7.3"
+ "version": "==1.10.2"
},
"pylru": {
"hashes": [
- "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ "sha256:47ad140a63ab9389648dadfbb4330700e0ffeeb28ec04664ee47d37ed133b0f4",
+ "sha256:b7c75b0676e2fbae647823bc209e23998772867d3679f1583c7350a9b02a59f0"
+ ],
+ "version": "==1.2.1"
+ },
+ "pymupdf": {
+ "hashes": [
+ "sha256:05c54acf69ee55ef97453f9c52982ef2839c188fe464d6b4cdc053bd4c6298f1",
+ "sha256:11b913664c059146e512e8559ebd9f976570ef21c0338c953836bc02051c1d7e",
+ "sha256:13ed689e5ad4c3adecb7586050de8baaa1819f48e2c57ca4e87f80e3b2727cb3",
+ "sha256:164dc67f1f5db3b22207b2aeba0fadff0503123c8f31c46768b7da7d3595a181",
+ "sha256:1e7b85e2611a9cca7a410e4c5a510a11131de7c5da9379e46615a8d3adfa6df5",
+ "sha256:38188f88a6e648b9f3a87d29de5b4ed52f910827a15859b183f1321c68e6ac00",
+ "sha256:39192c009afd8dd877a79ed02519ec8d17699bec9e9543115e490f06a553e200",
+ "sha256:4c5e7211b85e13050ac6e25879d4f0476b7a04f23bd3b6442489cec9f8da8418",
+ "sha256:7281324a0325dd30c033644cc8654167dcbfe47c4b1d49805d407fa5a64ce76b",
+ "sha256:909fb46900e7422515291761a1294902cf163226ec8918ea4c3454537336dfeb",
+ "sha256:945529b7868f9fe290b11dfbc37e2b9012610fac9763686ccf91a4d968305c5e",
+ "sha256:976fb0e93f025617890f8f8d8517371684131aa0e9fc0c1d0b4cd8bd564cce27",
+ "sha256:9998f7dfa0f99d6c2c3eb0dcfbfd44433247c23c4b781bc45f76dab421bc554b",
+ "sha256:a3b8e5c2de6192c89f379283aa07aa7fd044098dab43a8cd3ac172e961caf286",
+ "sha256:b0db8c81b6c781e373ed005f7595e49b760f91edb3b36d1dc69ec29b4fad34f8",
+ "sha256:c03004415a6d140b2c4bb494bb507c9ccbd55d713407e3b5bc1dd35fa45f2be0",
+ "sha256:cfd6c666b02a066e9e76d9ce8ca5e7fa4f2bf7a8ce6934cd2837b08509d46f8e",
+ "sha256:dffe67c5574d0ebb1e39b5ecf806fb4fd4ddb01bee5630f516ece4468252c9f0",
+ "sha256:ef3d13e27f1585d776f6a2597f113aabd28d36b648b983a72850b21c5399ab08",
+ "sha256:f04086036d40af50e5d6f54e949fa12eacda2d752562a2f85215763b137bf864",
+ "sha256:f3f96bd465e9e0e2960bb70e92233af0865181b9dd8ac5bc6b159d79584df2fe"
],
- "version": "==1.2.0"
- },
- "pymysql": {
- "hashes": [
- "sha256:263040d2779a3b84930f7ac9da5132be0fefcd6f453a885756656103f8ee1fdd",
- "sha256:44f47128dda8676e021c8d2dbb49a82be9e4ab158b9f03e897152a3a287c69ea"
- ],
- "version": "==0.10.1"
+ "index": "ia",
+ "version": "==1.19.6"
},
"python-dateutil": {
"hashes": [
- "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
- "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
+ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+ "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
- "version": "==2.8.1"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==2.8.2"
},
"python-magic": {
"hashes": [
- "sha256:356efa93c8899047d1eb7d3eb91e871ba2f5b1376edbaf4cc305e3c872207355",
- "sha256:b757db2a5289ea3f1ced9e60f072965243ea43a2221430048fd8cacab17be0ce"
+ "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b",
+ "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"
],
"index": "ia",
- "version": "==0.4.18"
+ "version": "==0.4.27"
},
"python-poppler": {
"hashes": [
- "sha256:6843398adc9c290035646c4cf3c7bfcea9c8e04390bb9cd8fdc9bd063fb77880"
+ "sha256:8b6a157e51cbb4c08353a21ca3f6f396558759cdfb0b80071379ad89d5f7c533"
],
"index": "ia",
- "version": "==0.2.2"
+ "version": "==0.3.0"
},
"python-snappy": {
"hashes": [
- "sha256:9c0ba725755b749ef9b03f6ed7582cefb957c0d9f6f064a7c4314148a9dbdb61",
- "sha256:a745b3732750e2e627adf45fe2669b18afb4170431b0d100da041f807bdea0c8",
- "sha256:ac48ec6146d71627bba0fe4857984ac1f3f70a35c12eed0f91b46f353952d5fa",
- "sha256:b08db966a9c041220b1b602a2e36498dc0755b46b0d8b119f568de71804b9aed",
- "sha256:d9c26532cfa510f45e8d135cde140e8a5603d3fb254cfec273ebc0ecf9f668e2",
- "sha256:f21e8472a7f11b65b4bb5aea1c12624e2d4199aa586c57a11faa0de86a3053a6",
- "sha256:f8bbf1e04d0ec722a7f2e16f2c179f5ada4cfc0ac1196703225894303b061dbb"
+ "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af",
+ "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba",
+ "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427",
+ "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f",
+ "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026",
+ "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc",
+ "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54",
+ "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539",
+ "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c",
+ "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1",
+ "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd",
+ "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf",
+ "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1",
+ "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130",
+ "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c",
+ "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d",
+ "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6",
+ "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2",
+ "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f",
+ "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228",
+ "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96",
+ "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b",
+ "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd",
+ "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8",
+ "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25",
+ "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380",
+ "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb",
+ "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4",
+ "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614",
+ "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee",
+ "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541",
+ "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480",
+ "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72",
+ "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d",
+ "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b",
+ "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295",
+ "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc",
+ "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5",
+ "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a",
+ "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b",
+ "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf",
+ "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b",
+ "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5",
+ "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1",
+ "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7",
+ "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a",
+ "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2",
+ "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"
],
"index": "ia",
- "version": "==0.5.4"
+ "version": "==0.6.1"
},
"pytz": {
"hashes": [
- "sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4",
- "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
+ ],
+ "version": "==2022.7"
+ },
+ "pytz-deprecation-shim": {
+ "hashes": [
+ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+ "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
- "version": "==2020.5"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==0.1.0.post0"
},
"pyyaml": {
"hashes": [
@@ -666,84 +892,210 @@
],
"version": "==5.3.1"
},
- "raven": {
- "extras": [
- "flask"
- ],
- "hashes": [
- "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
- "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
- ],
- "index": "ia",
- "version": "==6.10.0"
- },
- "readability-lxml": {
- "hashes": [
- "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305",
- "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"
- ],
- "version": "==0.8.1"
+ "rapidfuzz": {
+ "hashes": [
+ "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e",
+ "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e",
+ "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f",
+ "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f",
+ "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3",
+ "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077",
+ "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3",
+ "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da",
+ "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410",
+ "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a",
+ "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4",
+ "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a",
+ "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351",
+ "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59",
+ "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34",
+ "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95",
+ "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c",
+ "sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245",
+ "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb",
+ "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7",
+ "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2",
+ "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e",
+ "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064",
+ "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293",
+ "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c",
+ "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a",
+ "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278",
+ "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96",
+ "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510",
+ "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60",
+ "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e",
+ "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da",
+ "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554",
+ "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5",
+ "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f",
+ "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd",
+ "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66",
+ "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc",
+ "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa",
+ "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f",
+ "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280",
+ "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c",
+ "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31",
+ "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846",
+ "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd",
+ "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5",
+ "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75",
+ "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a",
+ "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559",
+ "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337",
+ "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208",
+ "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e",
+ "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6",
+ "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625",
+ "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad",
+ "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a",
+ "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef",
+ "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de",
+ "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5",
+ "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7",
+ "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782",
+ "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f",
+ "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed",
+ "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48",
+ "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d",
+ "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77",
+ "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5",
+ "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756",
+ "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a",
+ "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f",
+ "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c",
+ "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58",
+ "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79",
+ "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2",
+ "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8",
+ "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588",
+ "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77",
+ "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069",
+ "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672",
+ "sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7",
+ "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88",
+ "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7",
+ "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2",
+ "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e",
+ "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e",
+ "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347",
+ "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4",
+ "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a",
+ "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.13.7"
},
"redis": {
"hashes": [
- "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
- "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"
+ "sha256:7b8c87d19c45d3f1271b124858d2a5c13160c4e74d4835e28273400fa34d5228",
+ "sha256:cae3ee5d1f57d8caf534cd8764edf3163c77e073bdd74b6f54a87ffafdc5e7d9"
],
- "version": "==3.5.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"regex": {
"hashes": [
- "sha256:02951b7dacb123d8ea6da44fe45ddd084aa6777d4b2454fa0da61d569c6fa538",
- "sha256:0d08e71e70c0237883d0bef12cad5145b84c3705e9c6a588b2a9c7080e5af2a4",
- "sha256:1862a9d9194fae76a7aaf0150d5f2a8ec1da89e8b55890b1786b8f88a0f619dc",
- "sha256:1ab79fcb02b930de09c76d024d279686ec5d532eb814fd0ed1e0051eb8bd2daa",
- "sha256:1fa7ee9c2a0e30405e21031d07d7ba8617bc590d391adfc2b7f1e8b99f46f444",
- "sha256:262c6825b309e6485ec2493ffc7e62a13cf13fb2a8b6d212f72bd53ad34118f1",
- "sha256:2a11a3e90bd9901d70a5b31d7dd85114755a581a5da3fc996abfefa48aee78af",
- "sha256:2c99e97d388cd0a8d30f7c514d67887d8021541b875baf09791a3baad48bb4f8",
- "sha256:3128e30d83f2e70b0bed9b2a34e92707d0877e460b402faca908c6667092ada9",
- "sha256:38c8fd190db64f513fe4e1baa59fed086ae71fa45083b6936b52d34df8f86a88",
- "sha256:3bddc701bdd1efa0d5264d2649588cbfda549b2899dc8d50417e47a82e1387ba",
- "sha256:4902e6aa086cbb224241adbc2f06235927d5cdacffb2425c73e6570e8d862364",
- "sha256:49cae022fa13f09be91b2c880e58e14b6da5d10639ed45ca69b85faf039f7a4e",
- "sha256:56e01daca75eae420bce184edd8bb341c8eebb19dd3bce7266332258f9fb9dd7",
- "sha256:5862975b45d451b6db51c2e654990c1820523a5b07100fc6903e9c86575202a0",
- "sha256:6a8ce43923c518c24a2579fda49f093f1397dad5d18346211e46f134fc624e31",
- "sha256:6c54ce4b5d61a7129bad5c5dc279e222afd00e721bf92f9ef09e4fae28755683",
- "sha256:6e4b08c6f8daca7d8f07c8d24e4331ae7953333dbd09c648ed6ebd24db5a10ee",
- "sha256:717881211f46de3ab130b58ec0908267961fadc06e44f974466d1887f865bd5b",
- "sha256:749078d1eb89484db5f34b4012092ad14b327944ee7f1c4f74d6279a6e4d1884",
- "sha256:7913bd25f4ab274ba37bc97ad0e21c31004224ccb02765ad984eef43e04acc6c",
- "sha256:7a25fcbeae08f96a754b45bdc050e1fb94b95cab046bf56b016c25e9ab127b3e",
- "sha256:83d6b356e116ca119db8e7c6fc2983289d87b27b3fac238cfe5dca529d884562",
- "sha256:8b882a78c320478b12ff024e81dc7d43c1462aa4a3341c754ee65d857a521f85",
- "sha256:8f6a2229e8ad946e36815f2a03386bb8353d4bde368fdf8ca5f0cb97264d3b5c",
- "sha256:9801c4c1d9ae6a70aeb2128e5b4b68c45d4f0af0d1535500884d644fa9b768c6",
- "sha256:a15f64ae3a027b64496a71ab1f722355e570c3fac5ba2801cafce846bf5af01d",
- "sha256:a3d748383762e56337c39ab35c6ed4deb88df5326f97a38946ddd19028ecce6b",
- "sha256:a63f1a07932c9686d2d416fb295ec2c01ab246e89b4d58e5fa468089cab44b70",
- "sha256:b2b1a5ddae3677d89b686e5c625fc5547c6e492bd755b520de5332773a8af06b",
- "sha256:b2f4007bff007c96a173e24dcda236e5e83bde4358a557f9ccf5e014439eae4b",
- "sha256:baf378ba6151f6e272824b86a774326f692bc2ef4cc5ce8d5bc76e38c813a55f",
- "sha256:bafb01b4688833e099d79e7efd23f99172f501a15c44f21ea2118681473fdba0",
- "sha256:bba349276b126947b014e50ab3316c027cac1495992f10e5682dc677b3dfa0c5",
- "sha256:c084582d4215593f2f1d28b65d2a2f3aceff8342aa85afd7be23a9cad74a0de5",
- "sha256:d1ebb090a426db66dd80df8ca85adc4abfcbad8a7c2e9a5ec7513ede522e0a8f",
- "sha256:d2d8ce12b7c12c87e41123997ebaf1a5767a5be3ec545f64675388970f415e2e",
- "sha256:e32f5f3d1b1c663af7f9c4c1e72e6ffe9a78c03a31e149259f531e0fed826512",
- "sha256:e3faaf10a0d1e8e23a9b51d1900b72e1635c2d5b0e1bea1c18022486a8e2e52d",
- "sha256:f7d29a6fc4760300f86ae329e3b6ca28ea9c20823df123a2ea8693e967b29917",
- "sha256:f8f295db00ef5f8bae530fc39af0b40486ca6068733fb860b42115052206466f"
- ],
- "version": "==2020.11.13"
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
- "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
- "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.25.1"
+ "version": "==2.28.1"
},
"requests-file": {
"hashes": [
@@ -760,226 +1112,189 @@
},
"s3transfer": {
"hashes": [
- "sha256:2482b4259524933a022d59da830f51bd746db62f047d6eb213f2f8855dcb8a13",
- "sha256:921a37e2aefc64145e7b73d50c71bb4f26f46e4c9f414dc648c6245ff92cf7db"
+ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+ "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
- "version": "==0.3.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.6.0"
},
"schedule": {
"hashes": [
- "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
- "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ "sha256:617adce8b4bf38c360b781297d59918fbebfb2878f1671d189f4f4af5d0567a4",
+ "sha256:e6ca13585e62c810e13a08682e0a6a8ad245372e376ba2b8679294f377dfc8e4"
],
- "version": "==0.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.1.0"
},
"schema": {
"hashes": [
- "sha256:3a03c2e2b22e6a331ae73750ab1da46916da6ca861b16e6f073ac1d1eba43b71",
- "sha256:b536f2375b49fdf56f36279addae98bd86a8afbd58b3c32ce363c464bed5fc1c"
+ "sha256:f06717112c61895cabc4707752b88716e8420a8819d71404501e114f91043197",
+ "sha256:f3ffdeeada09ec34bf40d7d79996d9f7175db93b7a5065de0faa7f41083c1e6c"
],
- "version": "==0.7.2"
+ "version": "==0.7.5"
},
"selectolax": {
"hashes": [
- "sha256:01b26820667dcd8dc0ec94ed874ffc8e45043f0da70466544a9328a79282ff41",
- "sha256:03e4c0b6d8feb16472482c89a1bf0752335d015172263388c94b8089224ed9e6",
- "sha256:061efe18e01a624e33317d1f98f20d67f025e228f5cf87f198caceadff9e77f5",
- "sha256:108f0ed757c5e74cd3d15f3ddb615c891711ae9647fb002aca6dbad5c7f0084c",
- "sha256:13e6a6ec4b8fc43ef3f6586e17ba85832bbcdf8074d9a31a159d87dd81bf2627",
- "sha256:231ce804a5e186afa4e7f1639f3a2fdefc5151c1094746fa09821c7c9f5dbeb6",
- "sha256:274d70e46a94a7b673585957574e571b1838afb5862b9edc7477f704a2e8be3f",
- "sha256:290b9bc9df879c8538899b5d22e8fa272e07c9edc438396d9b9ad631a7689837",
- "sha256:2fe935472e9c2c14caf38b65a5ea836f0c3d56081945a8588e14f4136e34ba6b",
- "sha256:37cb0fd1d933ad7321caa68773fda490d686286eaf4d77922686ad14506c4a2c",
- "sha256:38661265f318459cd93b1a87b20d8b7b5adeaa353cc96e2d5087a05eef9ce8a3",
- "sha256:3b21ba8862be4445482e6954c61562851cebd9c9e5db73b0865ea4729e7c85b0",
- "sha256:4208bfab7c5e14d54104b7959ba1d66f67a51044cb1fccbab62d12c6bd905f02",
- "sha256:4233599d6507e11a6fab67d9e933d8f445859868b4162eb71c849a832935b575",
- "sha256:4714c5e6b18ad0ca9f2919b39f333590025e46cb0bb248ffe973333bbf18a491",
- "sha256:4b9f60a689c0453b6e2a6b92dd2407c82167f3d7624b27184842b2b58d5bc353",
- "sha256:4ebb88d954dabffa3bafad6cdd758612a7d3b84ceee692c5818bbf0fa93c5f6b",
- "sha256:519335c313c49151e0a282bef88043eab8756732f24eeb42d2a17e68b3ab174e",
- "sha256:5e2fb6a27bc7760d57f8cc53adcf5b300a021a3f4102df0e5dd8abb436041c28",
- "sha256:60ba2ce5060bac7d56dedefe1403602aac1b999a60596294ce3a9520e2c95d71",
- "sha256:6f7f7a1a030c5612529c0e9df46d690b54d22416d500095ddf3985527f8fb78f",
- "sha256:804f8e954428a1a325a62a88af39e1fef87c22f0689ee3c3a1d8968ee9648f6e",
- "sha256:88cc811bb3f9c4eac303dde5ba3ecde0972dba8cebf2fb8001467e752c888838",
- "sha256:8b85a1356e180d235d9ab92bc3dd90d07e78cab1ef324ae9d12207607c9f26f6",
- "sha256:8d8e3c7c43805628f2112cda81dba0b8f6620912c24ab2d6635f351985097971",
- "sha256:90da202496bb99a0924cd26c471f455f64308ed13a24500852635aef5014a43f",
- "sha256:98e8b60fca5ca6e2f0a2a1882f0c1b771612e5016bd6605545e7c20a8baac244",
- "sha256:a18f75af342476356e5a437fc5215a3b79b58f52b56d9ea6e1a985cc21895952",
- "sha256:a36581e0a4f74c5a67d22048fbf34221f9d480bde05acc57702b1cffdcb9ecf5",
- "sha256:a6724cb313cd7805c7cf4252fdf162e7253cf3a933b7c25ac954feed3edc23ce",
- "sha256:b8632b165d5da9ecbfb671dbfa879a874cd63d2ea66a8d21b065da1236949947",
- "sha256:bba6127957c3209e141e42077d952cb1df4a5dc23c522ca9038c8013509588d8",
- "sha256:c47c7602e8cf8bdce03716b0240d2067eec92f3185cffe34813c60706559ae6a",
- "sha256:c49ac91cb291eae5c396aa87725ad066ba2fd9690289f3ffcde0022e4276b56e",
- "sha256:d4144619f88bb94ee2c29cccc23b00a020d6d140d84eda8d7fc4da05dc15f352",
- "sha256:df57fdbbf772b72993e44cdb326b4937d225e0dd2083cce56100593fe791326a",
- "sha256:e577ea359151e4df515eabc0c6ea1ddda0577971597c5e9908498a80477befc6",
- "sha256:e6857ac61acbf747ea56f6c8a72968e7a6ba88053a9a2b5b44091bfb97fb1c87",
- "sha256:ec2e3f6e49ee252c2fd0c0f297513150ec04e59c7aa0236baebeaaf21b83ffef",
- "sha256:ecdbad6c95b93256df4c3cb14612215bcd754093415615c55a191bb17fd0ebdc",
- "sha256:f22653fd48a7f835891bab16095c6f983994d68d16925447e537eb6e3ab79fc4",
- "sha256:f6c637636cc3bd0025dc9bd07fde28d482c93a6c21cf2e88b827a06766b2b314",
- "sha256:f83648e412aa610bdff1259dc412383fb290427c05f54e4fad1419b16aca19fe",
- "sha256:f8f8488fa5859b0da7e4a1bd265b5c0bba45dbf8286e6cee17bf95bcb3d5e797"
+ "sha256:010b008aca04be6cf9727d6f206a583d79a82d397126a101f57f117113a082bb",
+ "sha256:0878aa1ab3906831b20ad9e316a77c8401030dd388f3c1c72ba51bc08d497584",
+ "sha256:087e663c0ba6d9d79294508b0a3145079e838950a0e2fc7b8b1485da3fe24254",
+ "sha256:0a8dddd34dea642429629aae21cf940668eaa1c66ab0bcf9970d72f38676697d",
+ "sha256:14c9368f9dd224f895ef1431b1961d6e9a56fb26a95b5c04900def7b8961744c",
+ "sha256:17ac0b2b4222ba2c16852c0035dcd31d9e100544e6a5138f6e01f6b1648691b5",
+ "sha256:1ba1cd707a0d0090cffb2851ec6ccfdc334ed0c2ea08ae8705a9f6c97a997f77",
+ "sha256:1d38157e2358dacf55e782d332b41391821b2ef237e34e47ff276b2184c96542",
+ "sha256:1f1ec20cc75e1866f7758e543907da222c5d8072e580cf6814f2f142036c695f",
+ "sha256:1fa1737b7031b467d8613919503c85482a59c65ac91fe60074180e625e2533c6",
+ "sha256:221051ffe8c2950e9ebe41e08103397a7b287dca05a9e8084bb9e925f2d9c556",
+ "sha256:264918c1e9e6f6657f47116e4dbd74b57c660d3e86f9cc78209f132c56c8e9e5",
+ "sha256:2d8c7ce06bdf83d3cd2a617211eec48c875826bae54c74e56aec2635daac2f31",
+ "sha256:31fb0fbc88674b3346e379664c5837070e79b2f65eab3e29b7c43e1b4fc1137c",
+ "sha256:3600747c5072725580f8dc249a40ae123840f22edab950f43b349d356f44268b",
+ "sha256:3d65d0c57cfa1b05beb5c72d3cb566f4fdaf16e5112082f300cfa6bd94836aff",
+ "sha256:3daaf7ec54565d3f15f9ce046f6a8e469d966dc4fc879af8c7f753d37994f70e",
+ "sha256:418738a2f46beea2444a1587adb4f509bdd8e7ddffac071dba097c1a3ddb8cfc",
+ "sha256:46776ca482a76b3f522e4d8f90474716e4da51dc2823f3ecc6a2ff38ef0663b7",
+ "sha256:46bacca9e9f077ff2c5a973c05b8862425f077c58f2dca8059b992ceaca6b6de",
+ "sha256:4c5c68f0139d0928298ef5e95137996e0efb6f8db364b1470221e8710834a0ab",
+ "sha256:51c33d33e4e4eec0d9c1b6accdda5c93f4e3a00b28e99fc4ebb2b95d1d4ef885",
+ "sha256:585a75f4aff85b48d0fc8f3e9afbd1e2c05902a332982d04bab93e8e1db2e4a4",
+ "sha256:5acbe02c26b43428c2f49e8f09a81bd47be7ea969c6798cde1a23c2b33d25c79",
+ "sha256:6111ac9e5ca02b13d8e3057c1e20d6608435c64a11f92460a59951a7209c2cf3",
+ "sha256:67c32c29bc9011ed1b6fd67a961073e69d67bf60bf09f3db54d6240c034719f4",
+ "sha256:68c42af2cabecf04528dff2d0bbebbecfbafc394a5192b6a5b3e1dcd19eeb766",
+ "sha256:709b1680a16f210c43e4f3240dfc15e3312ccd43c9ea20c8e20c81470214cfc6",
+ "sha256:762e91a0ac0caa2d8731568e5b2ad0cec6fc06465a9dd89280118ced4b7e0849",
+ "sha256:7d47e489a8b0181992a3384987c854bd88211685e1c32dcdcb8746ec98dbcf7e",
+ "sha256:7ebe824763782f0e6ad2accd57d0cef3a61922b72be99ccafebe0154e9b8aef6",
+ "sha256:7f1a35be9413bcd56f225b1509740ea8999a6f7558e0f0a50a4ca80b91bf11be",
+ "sha256:81c7847ff0f3561559bd98015aa3fe0a2dfb26966156f7704f7f65339d48e81c",
+ "sha256:9246bf586afaacfdc0e6fb17806ee0d3e1736d3d13a87c8e96214596d50576b7",
+ "sha256:9baff22ae7015e8f2697d5db0804ee379d53fa6e54f1dc7e9f61ee8ccb1bdb2e",
+ "sha256:a4634d7c7e9d2eb65d0fc7fe0d88641eb413cb7250fbfc66b3b4d88d49e4c724",
+ "sha256:a7fa03253260c3351f61cef36865b27ad4585516e9ac4a77244d237bfaf37f13",
+ "sha256:abac4b7afe430dd135f148d4001b593b09c8f64fccd63b15fbb03b77735e3405",
+ "sha256:ad0cfc7f66a2863d199af819c79bfa160bcc830e0f83fd5391cdd80e545af758",
+ "sha256:adabfb5635d00da49bddef3844dc65ca3da81acd889ea7be2a74ef9456558f36",
+ "sha256:ae58e7cc282a768a68abbfa39eff895788a39658c5a235524c21b09d182b3d3a",
+ "sha256:b348074bc3a0e16e9af1a2f57e0da18f5def97e415c6435dadc68aead7ccf060",
+ "sha256:b48e4c8df2c226552ac18636c2ebe9d100ff3daa8742616687bd2cbf74a81e2f",
+ "sha256:c23d9f82aea887347151538a58b15a8dbee4261e4114705c0974dee81eb796e0",
+ "sha256:c2b589be0dd45d62ec43a6446f09919b5be809c708d8ff6a7cb86acd9150091b",
+ "sha256:d13904fc037bcebc6d79e83c0a19e64cc9d4771cd7f27b325c63d1071ec0d0f0",
+ "sha256:d3506e831b972c1eb22538b25e7c991289b72b2e028bd27b633dfbd21c1a511a",
+ "sha256:d809fbf258c28190160b3fe5d34adddb1da44ed7a2f800b7125e0fac6e940016",
+ "sha256:da688ca957d68b8072dc9658506c07326f6332ff3fe03214fec375a4ccc67f8a",
+ "sha256:e001a40b25e478f8390c3898c5852cf9a226668ba02fdc4d8e3a4788ce64207a",
+ "sha256:e805b106edac716047afc6e9e49953242207909bfbb70bf47c53f231e2d27d74",
+ "sha256:eb86cacac6ed203c386afe6704732fb05d831006c65869f15f41d15e9e72973b",
+ "sha256:f5cef3310fc41f71e8fc19d05534d100f6c02789d46041777b0bbd70961a94ec",
+ "sha256:f76b0ad63b55e45d3c02e50ca8b8ef64a500aed9a5f50818173b66949470f8e4",
+ "sha256:fad7fb68e929082e6474e1392dd433d465b06b59e26158ef67813c0c8e5b7f66",
+ "sha256:fb3b3425ee21f5098531ce80dc48d99a555b8b2300deb0ddf84b6bc503f0a848",
+ "sha256:fc53731aa81617694667d4c56d21a9e26df840a219f4b62588af80c6781ba613"
],
"index": "ia",
- "version": "==0.2.10"
+ "version": "==0.3.11"
},
"sentry-sdk": {
"extras": [],
"hashes": [
- "sha256:0a711ec952441c2ec89b8f5d226c33bc697914f46e876b44a4edd3e7864cf4d0",
- "sha256:737a094e49a529dd0fdcaafa9e97cf7c3d5eb964bd229821d640bc77f3502b3f"
+ "sha256:5bbe4b72de22f9ac1e67f2a4e6efe8fbd595bb59b7b223443f50fe5802a5551c",
+ "sha256:9f0b960694e2d8bb04db4ba6ac2a645040caef4e762c65937998ff06064f10d6"
],
"index": "ia",
- "version": "==0.19.5"
+ "version": "==1.12.1"
},
"six": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
- "version": "==1.15.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
},
"soupsieve": {
"hashes": [
- "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851",
- "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"
- ],
- "markers": "python_version >= '3.0'",
- "version": "==2.1"
- },
- "sqlalchemy": {
- "hashes": [
- "sha256:04f995fcbf54e46cddeb4f75ce9dfc17075d6ae04ac23b2bacb44b3bc6f6bf11",
- "sha256:0c6406a78a714a540d980a680b86654feadb81c8d0eecb59f3d6c554a4c69f19",
- "sha256:0c72b90988be749e04eff0342dcc98c18a14461eb4b2ad59d611b57b31120f90",
- "sha256:108580808803c7732f34798eb4a329d45b04c562ed83ee90f09f6a184a42b766",
- "sha256:1418f5e71d6081aa1095a1d6b567a562d2761996710bdce9b6e6ba20a03d0864",
- "sha256:17610d573e698bf395afbbff946544fbce7c5f4ee77b5bcb1f821b36345fae7a",
- "sha256:216ba5b4299c95ed179b58f298bda885a476b16288ab7243e89f29f6aeced7e0",
- "sha256:2ff132a379838b1abf83c065be54cef32b47c987aedd06b82fc76476c85225eb",
- "sha256:314f5042c0b047438e19401d5f29757a511cfc2f0c40d28047ca0e4c95eabb5b",
- "sha256:318b5b727e00662e5fc4b4cd2bf58a5116d7c1b4dd56ffaa7d68f43458a8d1ed",
- "sha256:3ab5b44a07b8c562c6dcb7433c6a6c6e03266d19d64f87b3333eda34e3b9936b",
- "sha256:426ece890153ccc52cc5151a1a0ed540a5a7825414139bb4c95a868d8da54a52",
- "sha256:491fe48adc07d13e020a8b07ef82eefc227003a046809c121bea81d3dbf1832d",
- "sha256:4a84c7c7658dd22a33dab2e2aa2d17c18cb004a42388246f2e87cb4085ef2811",
- "sha256:54da615e5b92c339e339fe8536cce99fe823b6ed505d4ea344852aefa1c205fb",
- "sha256:5a7f224cdb7233182cec2a45d4c633951268d6a9bcedac37abbf79dd07012aea",
- "sha256:61628715931f4962e0cdb2a7c87ff39eea320d2aa96bd471a3c293d146f90394",
- "sha256:62285607a5264d1f91590abd874d6a498e229d5840669bd7d9f654cfaa599bd0",
- "sha256:62fb881ba51dbacba9af9b779211cf9acff3442d4f2993142015b22b3cd1f92a",
- "sha256:68428818cf80c60dc04aa0f38da20ad39b28aba4d4d199f949e7d6e04444ea86",
- "sha256:6aaa13ee40c4552d5f3a59f543f0db6e31712cc4009ec7385407be4627259d41",
- "sha256:70121f0ae48b25ef3e56e477b88cd0b0af0e1f3a53b5554071aa6a93ef378a03",
- "sha256:715b34578cc740b743361f7c3e5f584b04b0f1344f45afc4e87fbac4802eb0a0",
- "sha256:758fc8c4d6c0336e617f9f6919f9daea3ab6bb9b07005eda9a1a682e24a6cacc",
- "sha256:7d4b8de6bb0bc736161cb0bbd95366b11b3eb24dd6b814a143d8375e75af9990",
- "sha256:81d8d099a49f83111cce55ec03cc87eef45eec0d90f9842b4fc674f860b857b0",
- "sha256:888d5b4b5aeed0d3449de93ea80173653e939e916cc95fe8527079e50235c1d2",
- "sha256:95bde07d19c146d608bccb9b16e144ec8f139bcfe7fd72331858698a71c9b4f5",
- "sha256:9bf572e4f5aa23f88dd902f10bb103cb5979022a38eec684bfa6d61851173fec",
- "sha256:bab5a1e15b9466a25c96cda19139f3beb3e669794373b9ce28c4cf158c6e841d",
- "sha256:bd4b1af45fd322dcd1fb2a9195b4f93f570d1a5902a842e3e6051385fac88f9c",
- "sha256:bde677047305fe76c7ee3e4492b545e0018918e44141cc154fe39e124e433991",
- "sha256:c389d7cc2b821853fb018c85457da3e7941db64f4387720a329bc7ff06a27963",
- "sha256:d055ff750fcab69ca4e57b656d9c6ad33682e9b8d564f2fbe667ab95c63591b0",
- "sha256:d53f59744b01f1440a1b0973ed2c3a7de204135c593299ee997828aad5191693",
- "sha256:f115150cc4361dd46153302a640c7fa1804ac207f9cc356228248e351a8b4676",
- "sha256:f1e88b30da8163215eab643962ae9d9252e47b4ea53404f2c4f10f24e70ddc62",
- "sha256:f8191fef303025879e6c3548ecd8a95aafc0728c764ab72ec51a0bdf0c91a341"
- ],
- "version": "==1.3.22"
+ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
+ "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.2.post1"
},
"surt": {
"hashes": [
- "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
- "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720"
],
"version": "==0.3.1"
},
- "tldextract": {
+ "tld": {
"hashes": [
- "sha256:cfae9bc8bda37c3e8c7c8639711ad20e95dc85b207a256b60b0b23d7ff5540ea",
- "sha256:e57f22b6d00a28c21673d2048112f1bdcb6a14d4711568305f6bb96cf5bb53a1"
+ "sha256:266106ad9035f54cd5cce5f823911a51f697e7c58cb45bfbd6c53b4c2976ece2",
+ "sha256:69fed19d26bb3f715366fb4af66fdeace896c55c052b00e8aaba3a7b63f3e7f0",
+ "sha256:826bbe61dccc8d63144b51caef83e1373fbaac6f9ada46fca7846021f5d36fef",
+ "sha256:843844e4256c943983d86366b5af3ac9cd1c9a0b6465f04d9f70e3b4c1a7989f",
+ "sha256:a92ac6b84917e7d9e934434b8d37e9be534598f138fbb86b3c0d5426f2621890",
+ "sha256:b6650f2d5392a49760064bc55d73ce3397a378ef24ded96efb516c6b8ec68c26",
+ "sha256:ef5b162d6fa295822dacd4fe4df1b62d8df2550795a97399a8905821b58d3702"
],
- "version": "==3.1.0"
+ "markers": "python_version >= '2.7' and python_version < '4'",
+ "version": "==0.12.6"
},
- "toml": {
+ "tldextract": {
"hashes": [
- "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
- "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ "sha256:47aa4d8f1a4da79a44529c9a2ddc518663b25d371b805194ec5ce2a5f615ccd2",
+ "sha256:78aef13ac1459d519b457a03f1f74c1bf1c2808122a6bcc0e6840f81ba55ad73"
],
- "version": "==0.10.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.4.0"
},
"tqdm": {
"hashes": [
- "sha256:0cd81710de29754bf17b6fee07bdb86f956b4fa20d3078f02040f83e64309416",
- "sha256:f4f80b96e2ceafea69add7bf971b8403b9cba8fb4451c1220f91c79be4ebd208"
+ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
+ "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "version": "==4.55.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==4.64.1"
},
"trafilatura": {
"hashes": [
- "sha256:0561c80c284bd3facdb7e16dbc3105758548d1548bbc92beb3522c8ac2c27cea",
- "sha256:70fc752049772ec17225417d211af460f9e0a44e0d041ffb6f823fc25081df3b"
+ "sha256:a66189e4b9d591dce648f0cc79fb52a486e679708090189bc4fcd88068f095ef",
+ "sha256:c2bc0cbac6248363d938666cbedbb067ad8aefe31667c88038135b93efd475c3"
],
"index": "ia",
- "version": "==0.6.1"
+ "version": "==1.3.0"
},
"twitter": {
"hashes": [
- "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
- "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
- ],
- "version": "==1.18.0"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
- "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
- "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:1d9a3e45f2c440f308a7116d3672b0d1981aba8ac41cb7f3ed270ed50693f0e0",
+ "sha256:80ddd69ae2eeb88313feedeea31bf119fd6e79541ee5b37abb9c43d233194e10"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==1.19.6"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "tzdata": {
+ "hashes": [
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.7"
},
"tzlocal": {
"hashes": [
- "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
- "sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4"
+ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+ "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "version": "==2.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.2"
},
"urlcanon": {
"hashes": [
@@ -990,11 +1305,11 @@
},
"urllib3": {
"hashes": [
- "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
- "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"warctools": {
"hashes": [
@@ -1007,28 +1322,22 @@
"brotli"
],
"hashes": [
- "sha256:f8979bb2a20434e6af5d14d3cfd0058fd81cfd3c5ecefc7babb0fedb3ac320a1"
+ "sha256:3a3f149508d68ec53f5cdf434a45e5bb906beef731327d7bd2ef6b751c98281b"
],
"index": "ia",
- "version": "==0.6.4"
- },
- "wayback-esp": {
- "hashes": [
- "sha256:4b735baecf34d4ad08ec4338891fbcb7879f9a58441bc5c488d067f7e2cd5318"
- ],
- "version": "==0.2.12"
+ "version": "==0.8.6.1"
},
"wayback-search-js": {
"hashes": [
- "sha256:e3f48201b02f2ae437f4e1c171c9d7bd3d8126437510c933c3e3b45334552491"
+ "sha256:a474ba8da58f9cc27b1dce7f87a8cc7d119715ab4bab750dcc1d90f002074161"
],
- "version": "==2.13.0"
+ "version": "==3.1.21"
},
"wbex-client": {
"hashes": [
- "sha256:619ead0408195f4eb87198a99e497c649961da45fcf97cb9bc937ef9e06a9e7f"
+ "sha256:8c4028d744dda05cca932b411a826f9478a65cbc018784bff9528e973c7f9c36"
],
- "version": "==0.1.6"
+ "version": "==0.1.6.1"
},
"wcwidth": {
"hashes": [
@@ -1039,89 +1348,93 @@
},
"werkzeug": {
"hashes": [
- "sha256:2de2a5db0baeae7b2d2664949077c2ac63fbd16d98da0ff71837f7d1dea3fd43",
- "sha256:6c80b1e5ad3665290ea39320b91e1be1e0d5f60652b964a3070216de83d2e47c"
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
],
- "version": "==1.0.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
},
"zstandard": {
"hashes": [
- "sha256:064aac12b8e7813fa3870e7479e9cbd3803e33212b68e555b408711ea8f6cb54",
- "sha256:0b3ae587556a6f45cd982d7684b1318793430d0ae9e376dbc3d877b48ac6d576",
- "sha256:0b57df1f9530669d61f8708eb15ded6584db4a6733cc5155eb8561d31f292557",
- "sha256:0e5b8fd428d0d00fb7dabc0898de9e87659cb54738d527becff37d3d90df8e88",
- "sha256:17e8f29aae79d870daa3ab48c0dbf83594bf956c2c2125ae45cdfebd2b62d8ed",
- "sha256:1be45b237fea45c705d83215450a9381c2787bbf0720824b1fe23ed72f8db0b7",
- "sha256:1c065de617b7367c4da4de687a071932e48ae200d09c0afbc24415d98aec470d",
- "sha256:2075be64372206af3df40fef0fee657b44845d3e6d98b4cc8aba220be861de2d",
- "sha256:24ab8f1c7c970822bd55dbb091f7eb271b417e777e8b3ae6722e60d67f747c05",
- "sha256:2826d664eb84f9efe0fae47cf20c27f3662aae3556fbcc4cecd5318fbc9239f3",
- "sha256:3382ce6e44e9e847dce848bc2638403aa9320cb38edcc34b71e13be5793619e0",
- "sha256:36cd223d7fd0fe0e32e82993240e9a24503269c93431e62369088e2299cf4605",
- "sha256:391c30620e3ad6bc53804f32e3f74cbbaa713d95f46ac5f2e54e735d1dfc51c0",
- "sha256:3948000d753b9110e1eb43a6cba6fdb64c895faebb47628a96550edc5238b78a",
- "sha256:3bd044ef32bd6738c3db2cb2d4bc77812e9a3132df942303bbfcd1a484023b60",
- "sha256:403fa9544ecdedcc5fdc48f5e41e092658ac48222cfe6e75fb5710cb3d14c700",
- "sha256:41eab10e6570e14dd77a346f3dbb1eab3f23a652bce07ba47c8c23116b0cee9c",
- "sha256:477db538b596767d036379165a27aa2e19edbae50bec4cea195a986ba50bbad6",
- "sha256:4b054fd8cf274b958a3d7a201f8b42a30ebf8f76d87770075e1aca6017006e97",
- "sha256:4e6d6b0e541b00d0096a260d5f6eb32f737bfcdb2e5b87a7b7be77ef669c7a6c",
- "sha256:5be097127be1659bc6cffb5d885c781e61947597e2fcd1ecf48713313e53657d",
- "sha256:5dd700e52ec28c64d43f681ccde76b6436c8f89a332d6c9e22a6b629f28daeb5",
- "sha256:657a49b1df5a82985ea6495c6c1497a17e34e41a0bd8ef95a640342a19b8e6a4",
- "sha256:6f437168752e50ad6a47d054f4a41933693b1675f65663c117067747d95f057c",
- "sha256:70dfe74b24971476a6a20d42abb964c9ac0fb1af7b89228e5845748377543bd0",
- "sha256:7161d71debb94c456cbddd8a239e89219f37f0b1a4c0620a2c1400801aeeec7d",
- "sha256:7309bf511c8b332be2b5a834efbd7ee0cd43db2c811dd916fd0f48acd43e8722",
- "sha256:7c3c9657417bf1eccb94ad64544e12efa8ea3e16612944b32e253314472a54e5",
- "sha256:8486a01696e3cdfa47b93caa8f5064c9d277bad1c39eb31947bf2b8f019e3510",
- "sha256:85b37acd054f8f778e5c9832e17fb651f321a3daafa0eb94360eeffce141b0cf",
- "sha256:8df3114dfff411aa9827d754bb8fdcdaa15e63c96d7730778fe322f4c85360d8",
- "sha256:90f0bb1adcfea326c6548a45cc35474bec56a34d80310b6e78abab313da780fc",
- "sha256:95939a7e3972ec20e2e959ee9cd0fd858b25ff3a6f5040c5c78fcab51eeab030",
- "sha256:a51a09a3be208e627ebb518a78c639d240584f5d1da8106dcafa31d22103b4df",
- "sha256:a72cb707cc0a9d06e3912fe5b6c1648d70ac512f3e180018c82fe926926be12c",
- "sha256:a820ef78f39c29469caacb0bf43ffd024b78f242393c605daa748588b3247306",
- "sha256:aab21dd5724aa5bdd0aac16f5d175e5df0715fc614910220a918d50f08321982",
- "sha256:ac9b88a72f2dcfa3facbe6af96d59e82459e5815c15aa59481cc6080937ee02e",
- "sha256:b508a826c4b99835e3d8a8d415a6e516cacad4a95ef5ed01f60f9b067f200a51",
- "sha256:bd4da25cc46e972b029f8aa9f103c5977dbe461e1916ff7edec24065071b4a08",
- "sha256:cf67443d06b88218eb8915da2d968dcf6fdc384fb245f97155617ff3b8d77e92",
- "sha256:d2db7bcdc9b3e5a782d71df0163a6587b8b2f759cc4a819859e27e6ad2f778e6",
- "sha256:d2ec8309309fc7254d21286d6b3e5c28e4019cd8e266d1a860456a69ea7c2400",
- "sha256:d2fd76d29f4e8d7c4aac42617a0439506144146032b5d7b9b0a42f37f916fdb2",
- "sha256:d34848645f3507dc85baa8c67426f0685b08583e930fa3a1ab5048c5f0ba8fc1",
- "sha256:d3999f92ab7aab2a99ac7f7730b3bee8d6bd3e52953ed0e87ab881ca4244a315",
- "sha256:d4a7065d7fc991edb93483dbb7bc37dd091a2bac9572d9b9df243e6565d30522",
- "sha256:d78db92ac27cdcd55333b7e642cd400719842e692e8836f0b249e459b26d384b",
- "sha256:d7fecb5172dc885665581437fe96bf8f03ffc0022b723964c272accbb62713b4",
- "sha256:db1b3442441577d81bdae85fc7a4bd553e3161ec745e9dd1f2f93889248363fe",
- "sha256:e3731e0dc1c200e5c2f56ca36bed6c28903f764769f534fbf9ed4178f193e8aa",
- "sha256:e3963c919f65367587cf987a71991e69385f19cec9ad8166249b83e176cdbcd8",
- "sha256:e80ade52a06fb433c9ad7d6c8cfb3dafa34f05bedce543e95a670972ba41d65d",
- "sha256:ec1a20936484f3804fba4f29f7d8ed67c70e44536b0f0191a13eff4dc61c815c",
- "sha256:ed14a62f8bf2462f19373c337527ff684deb6d0d6b973fbcaece1f561c30f405",
- "sha256:fb5f0d29bcbfba6ef9beccba55f567d089747034add5cd7e8dc58842bb745803",
- "sha256:fbbe18afb67329577ab6a907f348175d3f6044d179a9b56b02206ff9e67c5b12"
+ "sha256:04c298d381a3b6274b0a8001f0da0ec7819d052ad9c3b0863fe8c7f154061f76",
+ "sha256:0fde1c56ec118940974e726c2a27e5b54e71e16c6f81d0b4722112b91d2d9009",
+ "sha256:126aa8433773efad0871f624339c7984a9c43913952f77d5abeee7f95a0c0860",
+ "sha256:1a4fb8b4ac6772e4d656103ccaf2e43e45bd16b5da324b963d58ef360d09eb73",
+ "sha256:2e4812720582d0803e84aefa2ac48ce1e1e6e200ca3ce1ae2be6d410c1d637ae",
+ "sha256:2f01b27d0b453f07cbcff01405cdd007e71f5d6410eb01303a16ba19213e58e4",
+ "sha256:31d12fcd942dd8dbf52ca5f6b1bbe287f44e5d551a081a983ff3ea2082867863",
+ "sha256:3c927b6aa682c6d96225e1c797f4a5d0b9f777b327dea912b23471aaf5385376",
+ "sha256:3d5bb598963ac1f1f5b72dd006adb46ca6203e4fb7269a5b6e1f99e85b07ad38",
+ "sha256:401508efe02341ae681752a87e8ac9ef76df85ef1a238a7a21786a489d2c983d",
+ "sha256:4514b19abe6dbd36d6c5d75c54faca24b1ceb3999193c5b1f4b685abeabde3d0",
+ "sha256:47dfa52bed3097c705451bafd56dac26535545a987b6759fa39da1602349d7ba",
+ "sha256:4fa496d2d674c6e9cffc561639d17009d29adee84a27cf1e12d3c9be14aa8feb",
+ "sha256:55a513ec67e85abd8b8b83af8813368036f03e2d29a50fc94033504918273980",
+ "sha256:55b3187e0bed004533149882ef8c24e954321f3be81f8a9ceffe35099b82a0d0",
+ "sha256:593f96718ad906e24d6534187fdade28b611f8ed06e27ba972ba48aecec45fc6",
+ "sha256:5e21032efe673b887464667d09406bab6e16d96b09ad87e80859e3a20b6745b6",
+ "sha256:60a86b7b2b1c300779167cf595e019e61afcc0e20c4838692983a921db9006ac",
+ "sha256:619f9bf37cdb4c3dc9d4120d2a1003f5db9446f3618a323219f408f6a9df6725",
+ "sha256:660b91eca10ee1b44c47843894abe3e6cfd80e50c90dee3123befbf7ca486bd3",
+ "sha256:67710d220af405f5ce22712fa741d85e8b3ada7a457ea419b038469ba379837c",
+ "sha256:6caed86cd47ae93915d9031dc04be5283c275e1a2af2ceff33932071f3eeff4d",
+ "sha256:6d2182e648e79213b3881998b30225b3f4b1f3e681f1c1eaf4cacf19bde1040d",
+ "sha256:72758c9f785831d9d744af282d54c3e0f9db34f7eae521c33798695464993da2",
+ "sha256:74c2637d12eaacb503b0b06efdf55199a11b1d7c580bd3dd9dfe84cac97ef2f6",
+ "sha256:755020d5aeb1b10bffd93d119e7709a2a7475b6ad79c8d5226cea3f76d152ce0",
+ "sha256:7ccc4727300f223184520a6064c161a90b5d0283accd72d1455bcd85ec44dd0d",
+ "sha256:81ab21d03e3b0351847a86a0b298b297fde1e152752614138021d6d16a476ea6",
+ "sha256:8371217dff635cfc0220db2720fc3ce728cd47e72bb7572cca035332823dbdfc",
+ "sha256:876567136b0359f6581ecd892bdb4ca03a0eead0265db73206c78cff03bcdb0f",
+ "sha256:879411d04068bd489db57dcf6b82ffad3c5fb2a1fdd30817c566d8b7bedee442",
+ "sha256:898500957ae5e7f31b7271ace4e6f3625b38c0ac84e8cedde8de3a77a7fdae5e",
+ "sha256:8c9ca56345b0c5574db47560603de9d05f63cce5dfeb3a456eb60f3fec737ff2",
+ "sha256:8ec2c146e10b59c376b6bc0369929647fcd95404a503a7aa0990f21c16462248",
+ "sha256:8f7c68de4f362c1b2f426395fe4e05028c56d0782b2ec3ae18a5416eaf775576",
+ "sha256:909bdd4e19ea437eb9b45d6695d722f6f0fd9d8f493e837d70f92062b9f39faf",
+ "sha256:9d97c713433087ba5cee61a3e8edb54029753d45a4288ad61a176fa4718033ce",
+ "sha256:a65e0119ad39e855427520f7829618f78eb2824aa05e63ff19b466080cd99210",
+ "sha256:aa9087571729c968cd853d54b3f6e9d0ec61e45cd2c31e0eb8a0d4bdbbe6da2f",
+ "sha256:aef0889417eda2db000d791f9739f5cecb9ccdd45c98f82c6be531bdc67ff0f2",
+ "sha256:b253d0c53c8ee12c3e53d181fb9ef6ce2cd9c41cbca1c56a535e4fc8ec41e241",
+ "sha256:b80f6f6478f9d4ca26daee6c61584499493bf97950cfaa1a02b16bb5c2c17e70",
+ "sha256:be6329b5ba18ec5d32dc26181e0148e423347ed936dda48bf49fb243895d1566",
+ "sha256:c7560f622e3849cc8f3e999791a915addd08fafe80b47fcf3ffbda5b5151047c",
+ "sha256:d1a7a716bb04b1c3c4a707e38e2dee46ac544fff931e66d7ae944f3019fc55b8",
+ "sha256:d63b04e16df8ea21dfcedbf5a60e11cbba9d835d44cb3cbff233cfd037a916d5",
+ "sha256:d777d239036815e9b3a093fa9208ad314c040c26d7246617e70e23025b60083a",
+ "sha256:e892d3177380ec080550b56a7ffeab680af25575d291766bdd875147ba246a91",
+ "sha256:e9c90a44470f2999779057aeaf33461cbd8bb59d8f15e983150d10bb260e16e0",
+ "sha256:f097dda5d4f9b9b01b3c9fa2069f9c02929365f48f341feddf3d6b32510a2f93",
+ "sha256:f4ebfe03cbae821ef994b2e58e4df6a087470cc522aca502614e82a143365d45"
],
"index": "ia",
- "version": "==0.14.1"
+ "version": "==0.19.0"
}
},
"develop": {
"astroid": {
"hashes": [
- "sha256:2f4078c2a41bf377eea06d71c9d2ba4eb8f6b1af2135bec27bbbb7d8f12bb703",
- "sha256:bc58d83eb610252fd8de6363e39d4f1d0619c894b0ed24603b881c02e64c7386"
+ "sha256:10e0ad5f7b79c435179d0d0f0df69998c4eef4597534aae44910db060baeb907",
+ "sha256:1493fe8bd3dfd73dc35bd53c9d5b6e49ead98497c47b2307662556a5692d29d7"
],
- "version": "==2.4.2"
+ "markers": "python_full_version >= '3.7.2'",
+ "version": "==2.12.13"
+ },
+ "asttokens": {
+ "hashes": [
+ "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3",
+ "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"
+ ],
+ "version": "==2.2.1"
},
"attrs": {
"hashes": [
- "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
- "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "version": "==20.3.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"backcall": {
"hashes": [
@@ -1130,96 +1443,146 @@
],
"version": "==0.2.0"
},
+ "black": {
+ "hashes": [
+ "sha256:101c69b23df9b44247bd88e1d7e90154336ac4992502d4197bdac35dd7ee3320",
+ "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351",
+ "sha256:1f58cbe16dfe8c12b7434e50ff889fa479072096d79f0a7f25e4ab8e94cd8350",
+ "sha256:229351e5a18ca30f447bf724d007f890f97e13af070bb6ad4c0a441cd7596a2f",
+ "sha256:436cc9167dd28040ad90d3b404aec22cedf24a6e4d7de221bec2730ec0c97bcf",
+ "sha256:559c7a1ba9a006226f09e4916060982fd27334ae1998e7a38b3f33a37f7a2148",
+ "sha256:7412e75863aa5c5411886804678b7d083c7c28421210180d67dfd8cf1221e1f4",
+ "sha256:77d86c9f3db9b1bf6761244bc0b3572a546f5fe37917a044e02f3166d5aafa7d",
+ "sha256:82d9fe8fee3401e02e79767016b4907820a7dc28d70d137eb397b92ef3cc5bfc",
+ "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d",
+ "sha256:c116eed0efb9ff870ded8b62fe9f28dd61ef6e9ddd28d83d7d264a38417dcee2",
+ "sha256:d30b212bffeb1e252b31dd269dfae69dd17e06d92b87ad26e23890f3efea366f"
+ ],
+ "index": "ia",
+ "version": "==22.12.0"
+ },
"certifi": {
"hashes": [
- "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
- "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2020.12.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
- "chardet": {
+ "charset-normalizer": {
"hashes": [
- "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
- "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==4.0.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
- "coverage": {
+ "click": {
"hashes": [
- "sha256:08b3ba72bd981531fd557f67beee376d6700fba183b167857038997ba30dd297",
- "sha256:2757fa64e11ec12220968f65d086b7a29b6583d16e9a544c889b22ba98555ef1",
- "sha256:3102bb2c206700a7d28181dbe04d66b30780cde1d1c02c5f3c165cf3d2489497",
- "sha256:3498b27d8236057def41de3585f317abae235dd3a11d33e01736ffedb2ef8606",
- "sha256:378ac77af41350a8c6b8801a66021b52da8a05fd77e578b7380e876c0ce4f528",
- "sha256:38f16b1317b8dd82df67ed5daa5f5e7c959e46579840d77a67a4ceb9cef0a50b",
- "sha256:3911c2ef96e5ddc748a3c8b4702c61986628bb719b8378bf1e4a6184bbd48fe4",
- "sha256:3a3c3f8863255f3c31db3889f8055989527173ef6192a283eb6f4db3c579d830",
- "sha256:3b14b1da110ea50c8bcbadc3b82c3933974dbeea1832e814aab93ca1163cd4c1",
- "sha256:535dc1e6e68fad5355f9984d5637c33badbdc987b0c0d303ee95a6c979c9516f",
- "sha256:6f61319e33222591f885c598e3e24f6a4be3533c1d70c19e0dc59e83a71ce27d",
- "sha256:723d22d324e7997a651478e9c5a3120a0ecbc9a7e94071f7e1954562a8806cf3",
- "sha256:76b2775dda7e78680d688daabcb485dc87cf5e3184a0b3e012e1d40e38527cc8",
- "sha256:782a5c7df9f91979a7a21792e09b34a658058896628217ae6362088b123c8500",
- "sha256:7e4d159021c2029b958b2363abec4a11db0ce8cd43abb0d9ce44284cb97217e7",
- "sha256:8dacc4073c359f40fcf73aede8428c35f84639baad7e1b46fce5ab7a8a7be4bb",
- "sha256:8f33d1156241c43755137288dea619105477961cfa7e47f48dbf96bc2c30720b",
- "sha256:8ffd4b204d7de77b5dd558cdff986a8274796a1e57813ed005b33fd97e29f059",
- "sha256:93a280c9eb736a0dcca19296f3c30c720cb41a71b1f9e617f341f0a8e791a69b",
- "sha256:9a4f66259bdd6964d8cf26142733c81fb562252db74ea367d9beb4f815478e72",
- "sha256:9a9d4ff06804920388aab69c5ea8a77525cf165356db70131616acd269e19b36",
- "sha256:a2070c5affdb3a5e751f24208c5c4f3d5f008fa04d28731416e023c93b275277",
- "sha256:a4857f7e2bc6921dbd487c5c88b84f5633de3e7d416c4dc0bb70256775551a6c",
- "sha256:a607ae05b6c96057ba86c811d9c43423f35e03874ffb03fbdcd45e0637e8b631",
- "sha256:a66ca3bdf21c653e47f726ca57f46ba7fc1f260ad99ba783acc3e58e3ebdb9ff",
- "sha256:ab110c48bc3d97b4d19af41865e14531f300b482da21783fdaacd159251890e8",
- "sha256:b239711e774c8eb910e9b1ac719f02f5ae4bf35fa0420f438cdc3a7e4e7dd6ec",
- "sha256:be0416074d7f253865bb67630cf7210cbc14eb05f4099cc0f82430135aaa7a3b",
- "sha256:c46643970dff9f5c976c6512fd35768c4a3819f01f61169d8cdac3f9290903b7",
- "sha256:c5ec71fd4a43b6d84ddb88c1df94572479d9a26ef3f150cef3dacefecf888105",
- "sha256:c6e5174f8ca585755988bc278c8bb5d02d9dc2e971591ef4a1baabdf2d99589b",
- "sha256:c89b558f8a9a5a6f2cfc923c304d49f0ce629c3bd85cb442ca258ec20366394c",
- "sha256:cc44e3545d908ecf3e5773266c487ad1877be718d9dc65fc7eb6e7d14960985b",
- "sha256:cc6f8246e74dd210d7e2b56c76ceaba1cc52b025cd75dbe96eb48791e0250e98",
- "sha256:cd556c79ad665faeae28020a0ab3bda6cd47d94bec48e36970719b0b86e4dcf4",
- "sha256:ce6f3a147b4b1a8b09aae48517ae91139b1b010c5f36423fa2b866a8b23df879",
- "sha256:ceb499d2b3d1d7b7ba23abe8bf26df5f06ba8c71127f188333dddcf356b4b63f",
- "sha256:cef06fb382557f66d81d804230c11ab292d94b840b3cb7bf4450778377b592f4",
- "sha256:e448f56cfeae7b1b3b5bcd99bb377cde7c4eb1970a525c770720a352bc4c8044",
- "sha256:e52d3d95df81c8f6b2a1685aabffadf2d2d9ad97203a40f8d61e51b70f191e4e",
- "sha256:ee2f1d1c223c3d2c24e3afbb2dd38be3f03b1a8d6a83ee3d9eb8c36a52bee899",
- "sha256:f2c6888eada180814b8583c3e793f3f343a692fc802546eed45f40a001b1169f",
- "sha256:f51dbba78d68a44e99d484ca8c8f604f17e957c1ca09c3ebc2c7e3bbd9ba0448",
- "sha256:f54de00baf200b4539a5a092a759f000b5f45fd226d6d25a76b0dff71177a714",
- "sha256:fa10fee7e32213f5c7b0d6428ea92e3a3fdd6d725590238a3f92c0de1c78b9d2",
- "sha256:fabeeb121735d47d8eab8671b6b031ce08514c86b7ad8f7d5490a7b6dcd6267d",
- "sha256:fac3c432851038b3e6afe086f777732bcf7f6ebbfd90951fa04ee53db6d0bcdd",
- "sha256:fda29412a66099af6d6de0baa6bd7c52674de177ec2ad2630ca264142d69c6c7",
- "sha256:ff1330e8bc996570221b450e2d539134baa9465f5cb98aff0e0f73f34172e0ae"
+ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
+ "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
],
- "version": "==5.3.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.3"
+ },
+ "coverage": {
+ "extras": [
+ "toml"
+ ],
+ "hashes": [
+ "sha256:07bcfb1d8ac94af886b54e18a88b393f6a73d5959bb31e46644a02453c36e475",
+ "sha256:09f6b5a8415b6b3e136d5fec62b552972187265cb705097bf030eb9d4ffb9b60",
+ "sha256:0a79137fc99815fff6a852c233628e735ec15903cfd16da0f229d9c4d45926ab",
+ "sha256:0b4b3a4d9915b2be879aff6299c0a6129f3d08a775d5a061f503cf79571f73e4",
+ "sha256:1285648428a6101b5f41a18991c84f1c3959cee359e51b8375c5882fc364a13f",
+ "sha256:12a5aa77783d49e05439fbe6e6b427484f8a0f9f456b46a51d8aac022cfd024d",
+ "sha256:19ec666533f0f70a0993f88b8273057b96c07b9d26457b41863ccd021a043b9a",
+ "sha256:1e414dc32ee5c3f36544ea466b6f52f28a7af788653744b8570d0bf12ff34bc0",
+ "sha256:2c44fcfb3781b41409d0f060a4ed748537557de9362a8a9282182fafb7a76ab4",
+ "sha256:397b4a923cc7566bbc7ae2dfd0ba5a039b61d19c740f1373791f2ebd11caea59",
+ "sha256:3cfc595d2af13856505631be072835c59f1acf30028d1c860b435c5fc9c15b69",
+ "sha256:3dd4ee135e08037f458425b8842d24a95a0961831a33f89685ff86b77d378f89",
+ "sha256:486ee81fa694b4b796fc5617e376326a088f7b9729c74d9defa211813f3861e4",
+ "sha256:4f943a3b2bc520102dd3e0bb465e1286e12c9a54f58accd71b9e65324d9c7c01",
+ "sha256:63d56165a7c76265468d7e0c5548215a5ba515fc2cba5232d17df97bffa10f6c",
+ "sha256:66b18c3cf8bbab0cce0d7b9e4262dc830e93588986865a8c78ab2ae324b3ed56",
+ "sha256:691571f31ace1837838b7e421d3a09a8c00b4aac32efacb4fc9bd0a5c647d25a",
+ "sha256:6c5ad996c6fa4d8ed669cfa1e8551348729d008a2caf81489ab9ea67cfbc7498",
+ "sha256:6d55d840e1b8c0002fce66443e124e8581f30f9ead2e54fbf6709fb593181f2c",
+ "sha256:72d1507f152abacea81f65fee38e4ef3ac3c02ff8bc16f21d935fd3a8a4ad910",
+ "sha256:74f70cd92669394eaf8d7756d1b195c8032cf7bbbdfce3bc489d4e15b3b8cf73",
+ "sha256:830525361249dc4cd013652b0efad645a385707a5ae49350c894b67d23fbb07c",
+ "sha256:854f22fa361d1ff914c7efa347398374cc7d567bdafa48ac3aa22334650dfba2",
+ "sha256:89caf4425fe88889e2973a8e9a3f6f5f9bbe5dd411d7d521e86428c08a873a4a",
+ "sha256:9158f8fb06747ac17bd237930c4372336edc85b6e13bdc778e60f9d685c3ca37",
+ "sha256:92651580bd46519067e36493acb394ea0607b55b45bd81dd4e26379ed1871f55",
+ "sha256:978258fec36c154b5e250d356c59af7d4c3ba02bef4b99cda90b6029441d797d",
+ "sha256:9823e4789ab70f3ec88724bba1a203f2856331986cd893dedbe3e23a6cfc1e4e",
+ "sha256:9b373c9345c584bb4b5f5b8840df7f4ab48c4cbb7934b58d52c57020d911b856",
+ "sha256:a4a574a19eeb67575a5328a5760bbbb737faa685616586a9f9da4281f940109c",
+ "sha256:aec2d1515d9d39ff270059fd3afbb3b44e6ec5758af73caf18991807138c7118",
+ "sha256:b3695c4f4750bca943b3e1f74ad4be8d29e4aeab927d50772c41359107bd5d5c",
+ "sha256:b3763e7fcade2ff6c8e62340af9277f54336920489ceb6a8cd6cc96da52fcc62",
+ "sha256:b66bb21a23680dee0be66557dc6b02a3152ddb55edf9f6723fa4a93368f7158d",
+ "sha256:b6f22bb64cc39bcb883e5910f99a27b200fdc14cdd79df8696fa96b0005c9444",
+ "sha256:b77015d1cb8fe941be1222a5a8b4e3fbca88180cfa7e2d4a4e58aeabadef0ab7",
+ "sha256:b9ea158775c7c2d3e54530a92da79496fb3fb577c876eec761c23e028f1e216c",
+ "sha256:c20cfebcc149a4c212f6491a5f9ff56f41829cd4f607b5be71bb2d530ef243b1",
+ "sha256:cfded268092a84605f1cc19e5c737f9ce630a8900a3589e9289622db161967e9",
+ "sha256:d1991f1dd95eba69d2cd7708ff6c2bbd2426160ffc73c2b81f617a053ebcb1a8",
+ "sha256:d3022c3007d3267a880b5adcf18c2a9bf1fc64469b394a804886b401959b8742",
+ "sha256:d6814854c02cbcd9c873c0f3286a02e3ac1250625cca822ca6bc1018c5b19f1c",
+ "sha256:d87717959d4d0ee9db08a0f1d80d21eb585aafe30f9b0a54ecf779a69cb015f6",
+ "sha256:e00c14720b8b3b6c23b487e70bd406abafc976ddc50490f645166f111c419c39",
+ "sha256:e60bef2e2416f15fdc05772bf87db06c6a6f9870d1db08fdd019fbec98ae24a9",
+ "sha256:e78e9dcbf4f3853d3ae18a8f9272111242531535ec9e1009fa8ec4a2b74557dc",
+ "sha256:f66460f17c9319ea4f91c165d46840314f0a7c004720b20be58594d162a441d8",
+ "sha256:fa6a5a224b7f4cfb226f4fc55a57e8537fcc096f42219128c2c74c0e7d0953e1",
+ "sha256:fb992c47cb1e5bd6a01e97182400bcc2ba2077080a17fcd7be23aaa6e572e390",
+ "sha256:fd1b9c5adc066db699ccf7fa839189a649afcdd9e02cb5dc9d24e67e7922737d",
+ "sha256:fd556ff16a57a070ce4f31c635953cc44e25244f91a0378c6e9bdfd40fdb249f"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==7.0.1"
},
"decorator": {
"hashes": [
- "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
- "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
+ },
+ "dill": {
+ "hashes": [
+ "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0",
+ "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"
],
- "version": "==4.4.2"
+ "markers": "python_version < '3.11'",
+ "version": "==0.3.6"
+ },
+ "executing": {
+ "hashes": [
+ "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc",
+ "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"
+ ],
+ "version": "==1.2.0"
},
"flake8": {
"hashes": [
- "sha256:749dbbd6bfd0cf1318af27bf97a14e28e5ff548ef8e5b1566ccfb25a11e7c839",
- "sha256:aadae8761ec651813c24be05c6f7b4680857ef6afaae4651a4eccaef97ce6c3b"
+ "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7",
+ "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"
],
"index": "ia",
- "version": "==3.8.4"
+ "version": "==6.0.0"
},
"flake8-annotations": {
"hashes": [
- "sha256:0bcebb0792f1f96d617ded674dca7bf64181870bfe5dace353a1483551f8e5f1",
- "sha256:bebd11a850f6987a943ce8cdff4159767e0f5f89b3c88aca64680c2175ee02df"
+ "sha256:11f09efb99ae63c8f9d6b492b75fe147fbc323179fddfe00b2e56eefeca42f57",
+ "sha256:a4385158a7a9fc8af1d8820a2f4c8d03387997006a83f5f8bfe5bc6085bdf88a"
],
"index": "ia",
- "version": "==2.4.1"
+ "version": "==2.9.1"
},
"idna": {
"hashes": [
@@ -1237,85 +1600,104 @@
},
"ipython": {
"hashes": [
- "sha256:c987e8178ced651532b3b1ff9965925bfd445c279239697052561a9ab806d28f",
- "sha256:cbb2ef3d5961d44e6a963b9817d4ea4e1fa2eb589c371a470fed14d8d40cbd6a"
+ "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9",
+ "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"
],
"index": "ia",
- "version": "==7.19.0"
- },
- "ipython-genutils": {
- "hashes": [
- "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
- "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
- ],
- "version": "==0.2.0"
+ "version": "==8.7.0"
},
"isort": {
"hashes": [
- "sha256:dcab1d98b469a12a1a624ead220584391648790275560e1a43e54c5dceae65e7",
- "sha256:dcaeec1b5f0eca77faea2a35ab790b4f3680ff75590bfcb7145986905aab2f58"
+ "sha256:6db30c5ded9815d813932c04c2f85a360bcdd35fed496f4d8f35495ef0a261b6",
+ "sha256:c033fd0edb91000a7f09527fe5c75321878f98322a77ddcc81adbd83724afb7b"
],
- "version": "==5.6.4"
+ "index": "ia",
+ "version": "==5.11.4"
},
"jedi": {
"hashes": [
- "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93",
- "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"
+ "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e",
+ "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"
],
- "version": "==0.18.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.18.2"
},
"lazy-object-proxy": {
"hashes": [
- "sha256:0c4b206227a8097f05c4dbdd323c50edf81f15db3b8dc064d08c62d37e1a504d",
- "sha256:194d092e6f246b906e8f70884e620e459fc54db3259e60cf69a4d66c3fda3449",
- "sha256:1be7e4c9f96948003609aa6c974ae59830a6baecc5376c25c92d7d697e684c08",
- "sha256:4677f594e474c91da97f489fea5b7daa17b5517190899cf213697e48d3902f5a",
- "sha256:48dab84ebd4831077b150572aec802f303117c8cc5c871e182447281ebf3ac50",
- "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd",
- "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239",
- "sha256:8d859b89baf8ef7f8bc6b00aa20316483d67f0b1cbf422f5b4dc56701c8f2ffb",
- "sha256:9254f4358b9b541e3441b007a0ea0764b9d056afdeafc1a5569eee1cc6c1b9ea",
- "sha256:9651375199045a358eb6741df3e02a651e0330be090b3bc79f6d0de31a80ec3e",
- "sha256:97bb5884f6f1cdce0099f86b907aa41c970c3c672ac8b9c8352789e103cf3156",
- "sha256:9b15f3f4c0f35727d3a0fba4b770b3c4ebbb1fa907dbcc046a1d2799f3edd142",
- "sha256:a2238e9d1bb71a56cd710611a1614d1194dc10a175c1e08d75e1a7bcc250d442",
- "sha256:a6ae12d08c0bf9909ce12385803a543bfe99b95fe01e752536a60af2b7797c62",
- "sha256:ca0a928a3ddbc5725be2dd1cf895ec0a254798915fb3a36af0964a0a4149e3db",
- "sha256:cb2c7c57005a6804ab66f106ceb8482da55f5314b7fcb06551db1edae4ad1531",
- "sha256:d74bb8693bf9cf75ac3b47a54d716bbb1a92648d5f781fc799347cfc95952383",
- "sha256:d945239a5639b3ff35b70a88c5f2f491913eb94871780ebfabb2568bd58afc5a",
- "sha256:eba7011090323c1dadf18b3b689845fd96a61ba0a1dfbd7f24b921398affc357",
- "sha256:efa1909120ce98bbb3777e8b6f92237f5d5c8ea6758efea36a473e1d38f7d3e4",
- "sha256:f3900e8a5de27447acbf900b4750b0ddfd7ec1ea7fbaf11dfa911141bc522af0"
- ],
- "version": "==1.4.3"
+ "sha256:0c1c7c0433154bb7c54185714c6929acc0ba04ee1b167314a779b9025517eada",
+ "sha256:14010b49a2f56ec4943b6cf925f597b534ee2fe1f0738c84b3bce0c1a11ff10d",
+ "sha256:4e2d9f764f1befd8bdc97673261b8bb888764dfdbd7a4d8f55e4fbcabb8c3fb7",
+ "sha256:4fd031589121ad46e293629b39604031d354043bb5cdf83da4e93c2d7f3389fe",
+ "sha256:5b51d6f3bfeb289dfd4e95de2ecd464cd51982fe6f00e2be1d0bf94864d58acd",
+ "sha256:6850e4aeca6d0df35bb06e05c8b934ff7c533734eb51d0ceb2d63696f1e6030c",
+ "sha256:6f593f26c470a379cf7f5bc6db6b5f1722353e7bf937b8d0d0b3fba911998858",
+ "sha256:71d9ae8a82203511a6f60ca5a1b9f8ad201cac0fc75038b2dc5fa519589c9288",
+ "sha256:7e1561626c49cb394268edd00501b289053a652ed762c58e1081224c8d881cec",
+ "sha256:8f6ce2118a90efa7f62dd38c7dbfffd42f468b180287b748626293bf12ed468f",
+ "sha256:ae032743794fba4d171b5b67310d69176287b5bf82a21f588282406a79498891",
+ "sha256:afcaa24e48bb23b3be31e329deb3f1858f1f1df86aea3d70cb5c8578bfe5261c",
+ "sha256:b70d6e7a332eb0217e7872a73926ad4fdc14f846e85ad6749ad111084e76df25",
+ "sha256:c219a00245af0f6fa4e95901ed28044544f50152840c5b6a3e7b2568db34d156",
+ "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8",
+ "sha256:d176f392dbbdaacccf15919c77f526edf11a34aece58b55ab58539807b85436f",
+ "sha256:e20bfa6db17a39c706d24f82df8352488d2943a3b7ce7d4c22579cb89ca8896e",
+ "sha256:eac3a9a5ef13b332c059772fd40b4b1c3d45a3a2b05e33a361dee48e54a4dad0",
+ "sha256:eb329f8d8145379bf5dbe722182410fe8863d186e51bf034d2075eb8d85ee25b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.8.0"
+ },
+ "matplotlib-inline": {
+ "hashes": [
+ "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311",
+ "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==0.1.6"
},
"mccabe": {
"hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
],
- "version": "==0.6.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
},
"mypy": {
"hashes": [
- "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324",
- "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc",
- "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802",
- "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122",
- "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975",
- "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7",
- "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666",
- "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669",
- "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178",
- "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01",
- "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea",
- "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de",
- "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1",
- "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c"
+ "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d",
+ "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6",
+ "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf",
+ "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f",
+ "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813",
+ "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33",
+ "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad",
+ "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05",
+ "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297",
+ "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06",
+ "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd",
+ "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243",
+ "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305",
+ "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476",
+ "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711",
+ "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70",
+ "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5",
+ "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461",
+ "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab",
+ "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c",
+ "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d",
+ "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135",
+ "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93",
+ "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648",
+ "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a",
+ "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb",
+ "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3",
+ "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372",
+ "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb",
+ "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"
],
"index": "ia",
- "version": "==0.790"
+ "version": "==0.991"
},
"mypy-extensions": {
"hashes": [
@@ -1326,17 +1708,27 @@
},
"packaging": {
"hashes": [
- "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858",
- "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "version": "==20.8"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
"parso": {
"hashes": [
- "sha256:15b00182f472319383252c18d5913b69269590616c947747bc50bf4ac768f410",
- "sha256:8519430ad07087d4c997fda3a7918f7cfa27cb58972a8c89c2a0295a1c940e9e"
+ "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0",
+ "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"
],
- "version": "==0.8.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
+ },
+ "pathspec": {
+ "hashes": [
+ "sha256:3c95343af8b756205e2aba76e843ba9520a24dd84f68c22b9f93251507509dd6",
+ "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.10.3"
},
"pexpect": {
"hashes": [
@@ -1353,19 +1745,29 @@
],
"version": "==0.7.5"
},
+ "platformdirs": {
+ "hashes": [
+ "sha256:1a89a12377800c81983db6be069ec068eee989748799b946cce2a6e80dcc54ca",
+ "sha256:b46ffafa316e6b83b47489d240ce17173f123a9b9c83282141c3daf26ad9ac2e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.6.0"
+ },
"pluggy": {
"hashes": [
- "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
- "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+ "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
- "version": "==0.13.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.0.0"
},
"prompt-toolkit": {
"hashes": [
- "sha256:25c95d2ac813909f813c93fde734b6e44406d1477a9faef7c915ff37d39c0a8c",
- "sha256:7debb9a521e0b1ee7d2fe96ee4bd60ef03c6492784de0547337ca4433e46aa63"
+ "sha256:3e163f254bef5a03b146397d7c1963bd3e2812f0964bb9a24e6ec761fd28db63",
+ "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305"
],
- "version": "==3.0.8"
+ "markers": "python_full_version >= '3.6.2'",
+ "version": "==3.0.36"
},
"ptyprocess": {
"hashes": [
@@ -1374,175 +1776,224 @@
],
"version": "==0.7.0"
},
+ "pure-eval": {
+ "hashes": [
+ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
+ "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ ],
+ "version": "==0.2.2"
+ },
"py": {
"hashes": [
- "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
- "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
+ "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
- "version": "==1.10.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.11.0"
},
"pycodestyle": {
"hashes": [
- "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367",
- "sha256:c58a7d2815e0e8d7972bf1803331fb0152f867bd89adf8a01dfd55085434192e"
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
],
- "version": "==2.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.10.0"
},
"pyflakes": {
"hashes": [
- "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92",
- "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"
+ "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf",
+ "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"
],
- "version": "==2.2.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.1"
},
"pygments": {
"hashes": [
- "sha256:ccf3acacf3782cbed4a989426012f1c535c9a90d3a7fc3f16d231b9372d2b716",
- "sha256:f275b6c0909e5dafd2d6269a656aa90fa58ebf4a74f8fcf9053195d226b24a08"
+ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
+ "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "version": "==2.7.3"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.13.0"
},
"pylint": {
"hashes": [
- "sha256:bb4a908c9dadbc3aac18860550e870f58e1a02c9f2c204fdf5693d73be061210",
- "sha256:bfe68f020f8a0fece830a22dd4d5dddb4ecc6137db04face4c3420a46a52239f"
+ "sha256:18783cca3cfee5b83c6c5d10b3cdb66c6594520ffae61890858fe8d932e1c6b4",
+ "sha256:349c8cd36aede4d50a0754a8c0218b43323d13d5d88f4b2952ddfe3e169681eb"
],
"index": "ia",
- "version": "==2.6.0"
- },
- "pyparsing": {
- "hashes": [
- "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
- "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
- ],
- "version": "==2.4.7"
+ "version": "==2.15.9"
},
"pytest": {
"hashes": [
- "sha256:1969f797a1a0dbd8ccf0fecc80262312729afea9c17f1d70ebf85c5e76c6f7c8",
- "sha256:66e419b1899bc27346cb2c993e12c5e5e8daba9073c1fbce33b9807abc95c306"
+ "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+ "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
"index": "ia",
- "version": "==6.2.1"
+ "version": "==6.2.5"
},
"pytest-cov": {
"hashes": [
- "sha256:45ec2d5182f89a81fc3eb29e3d1ed3113b9e9a873bcddb2a71faaab066110191",
- "sha256:47bd0ce14056fdd79f93e1713f88fad7bdcc583dcd7783da86ef2f085a0bb88e"
+ "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b",
+ "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"
],
"index": "ia",
- "version": "==2.10.1"
+ "version": "==4.0.0"
},
"pytest-mock": {
"hashes": [
- "sha256:c0fc979afac4aaba545cbd01e9c20736eb3fefb0a066558764b07d3de8f04ed3",
- "sha256:c3981f5edee6c4d1942250a60d9b39d38d5585398de1bfce057f925bdda720f4"
+ "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b",
+ "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"
],
"index": "ia",
- "version": "==3.4.0"
+ "version": "==3.10.0"
},
"pytest-pylint": {
"hashes": [
- "sha256:790c7a8019fab08e59bd3812db1657a01995a975af8b1c6ce95b9aa39d61da27",
- "sha256:b63aaf8b80ff33c8ceaa7f68323ed04102c7790093ccf6bdb261a4c2dc6fd564"
+ "sha256:b51d3f93bed9c192e2b046f16520981bee5abe7bd61b070306e7ee685219fdd3",
+ "sha256:d88e83c1023c641548a9ec3567707ceee7616632a986af133426d4a74d066932"
],
"index": "ia",
- "version": "==0.18.0"
+ "version": "==0.19.0"
},
"pytest-pythonpath": {
"hashes": [
- "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6",
+ "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"
],
"index": "ia",
- "version": "==0.7.3"
+ "version": "==0.7.4"
},
"requests": {
"hashes": [
- "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
- "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.25.1"
+ "version": "==2.28.1"
},
"responses": {
"hashes": [
- "sha256:2e5764325c6b624e42b428688f2111fea166af46623cb0127c05f6afb14d3457",
- "sha256:ef265bd3200bdef5ec17912fc64a23570ba23597fd54ca75c18650fa1699213d"
+ "sha256:396acb2a13d25297789a5866b4881cf4e46ffd49cc26c43ab1117f40b973102e",
+ "sha256:dcf294d204d14c436fddcc74caefdbc5764795a40ff4e6a7740ed8ddbf3294be"
],
"index": "ia",
- "version": "==0.12.1"
+ "version": "==0.22.0"
},
"six": {
"hashes": [
- "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
- "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
+ },
+ "stack-data": {
+ "hashes": [
+ "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815",
+ "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"
],
- "version": "==1.15.0"
+ "version": "==0.6.2"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
"version": "==0.10.2"
},
+ "tomli": {
+ "hashes": [
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+ ],
+ "version": "==2.0.1"
+ },
+ "tomlkit": {
+ "hashes": [
+ "sha256:07de26b0d8cfc18f871aec595fda24d95b08fef89d147caa861939f37230bf4b",
+ "sha256:71b952e5721688937fb02cf9d354dbcf0785066149d2855e44531ebdd2b65d73"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.11.6"
+ },
"traitlets": {
"hashes": [
- "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396",
- "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"
- ],
- "version": "==5.0.5"
- },
- "typed-ast": {
- "hashes": [
- "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
- "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
- "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
- "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
- "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
- "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
- "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
- "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
- "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
- "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
- "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
- "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
- "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
- "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
- "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
- "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
- "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
- "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
- "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
- "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
- "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
- "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
- "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
- "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
- "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
- "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
- "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
- "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
- "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
- "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
- ],
- "version": "==1.4.1"
+ "sha256:6cc57d6dc28c85d5365961726ffd19b538739347749e13ebe34e03323a0e8f84",
+ "sha256:c864831efa0ba6576d09b44884b34e41defc18c0d7e720b4a2d6698c842cab3e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==5.8.0"
+ },
+ "types-beautifulsoup4": {
+ "hashes": [
+ "sha256:c1f803367a2b07ad4fdac40ddbea557010dc4ddd1ee92d801f317eb02e2e3c72",
+ "sha256:d46be8f409ddccb6daaa9d118484185e70bcf552085c39c6d05b157cd1462e04"
+ ],
+ "index": "ia",
+ "version": "==4.11.6.1"
+ },
+ "types-dateparser": {
+ "hashes": [
+ "sha256:5b0c8845167981f68f090894aa371bddbd0371341b90c3f868ac9524cd0a6b69",
+ "sha256:65232f1b3a952476fb98b31ae0a4019efd32635981040149b97b161d5ce2b4da"
+ ],
+ "index": "ia",
+ "version": "==1.1.4.4"
+ },
+ "types-pillow": {
+ "hashes": [
+ "sha256:98b8484ff343676f6f7051682a6cfd26896e993e86b3ce9badfa0ec8750f5405",
+ "sha256:c18d466dc18550d96b8b4a279ff94f0cbad696825b5ad55466604f1daf5709de"
+ ],
+ "index": "ia",
+ "version": "==9.3.0.4"
+ },
+ "types-psycopg2": {
+ "hashes": [
+ "sha256:084558d6bc4b2cfa249b06be0fdd9a14a69d307bae5bb5809a2f14cfbaa7a23f",
+ "sha256:bff045579642ce00b4a3c8f2e401b7f96dfaa34939f10be64b0dd3b53feca57d"
+ ],
+ "index": "ia",
+ "version": "==2.9.21.2"
+ },
+ "types-requests": {
+ "hashes": [
+ "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3",
+ "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"
+ ],
+ "index": "ia",
+ "version": "==2.28.11.7"
+ },
+ "types-toml": {
+ "hashes": [
+ "sha256:171bdb3163d79a520560f24ba916a9fc9bff81659c5448a9fea89240923722be",
+ "sha256:b7b5c4977f96ab7b5ac06d8a6590d17c0bf252a96efc03b109c2711fb3e0eafd"
+ ],
+ "version": "==0.10.8.1"
+ },
+ "types-urllib3": {
+ "hashes": [
+ "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49",
+ "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"
+ ],
+ "version": "==1.26.25.4"
},
"typing-extensions": {
"hashes": [
- "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
- "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
- "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
],
- "version": "==3.7.4.3"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
},
"urllib3": {
"hashes": [
- "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
- "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version != '3.4'",
- "version": "==1.26.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"wcwidth": {
"hashes": [
@@ -1553,9 +2004,73 @@
},
"wrapt": {
"hashes": [
- "sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7"
- ],
- "version": "==1.12.1"
+ "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3",
+ "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b",
+ "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4",
+ "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2",
+ "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656",
+ "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3",
+ "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff",
+ "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310",
+ "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a",
+ "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57",
+ "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069",
+ "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383",
+ "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe",
+ "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87",
+ "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d",
+ "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b",
+ "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907",
+ "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f",
+ "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0",
+ "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28",
+ "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1",
+ "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853",
+ "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc",
+ "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3",
+ "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3",
+ "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164",
+ "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1",
+ "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c",
+ "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1",
+ "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7",
+ "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1",
+ "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320",
+ "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed",
+ "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1",
+ "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248",
+ "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c",
+ "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456",
+ "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77",
+ "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef",
+ "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1",
+ "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7",
+ "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86",
+ "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4",
+ "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d",
+ "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d",
+ "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8",
+ "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5",
+ "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471",
+ "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00",
+ "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68",
+ "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3",
+ "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d",
+ "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735",
+ "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d",
+ "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569",
+ "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7",
+ "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59",
+ "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5",
+ "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb",
+ "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b",
+ "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f",
+ "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462",
+ "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015",
+ "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==1.14.1"
}
}
}
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4395f19
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,46 @@
+
+This directory contains `sandcrawler` python code for ingest pipelines, batch
+processing, PDF extraction, etc.
+
+
+## Development Quickstart
+
+As of December 2022, working with this code requires:
+
+- Python 3.8 (specifically, due to the python version specification in the `pipenv` config)
+- `pipenv` for python dependency management
+- generic and python-specific build tools (`pkg-config`, `python-dev`, etc)
+- poppler (PDF processing library)
+- libmagic
+- libsodium
+- access to IA internal packages (`devpi.us.archive.org`), specifically for
+ globalwayback and related packages
+
+In production and CI we use Ubuntu Focal (20.04). The CI script for this
+repository (`../.gitlab-ci.yml`) is the best place to look for a complete list
+of dependencies for both development and deployment. Note that our CI system
+runs from our cluster, which resolves the devpi access issue. For developer
+laptops, you may need `sshuttle` or something similar set up to do initial
+package pulls.
+
+It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when
+working with pipenv. You can include this in a `.env` file.
+
+There is a Makefile which helps with the basics. Eg:
+
+ # install deps using pipenv
+ make deps
+
+ # run python tests
+ make test
+
+ # run code formatting and lint checks
+ make fmt lint
+
+Sometimes when developing it is helpful to enter a shell with pipenv, eg:
+
+ pipenv shell
+
+Often when developing it is helpful (or necessary) to set environment
+variables. `pipenv shell` reads from `.env`, so you can copy `example.env` to
+`.env` and edit it; those values will then be used in tests, `pipenv shell`, etc.
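
A minimal first-run sketch combining the steps above (assuming the file and
Makefile target names shown in this README):

    # copy the example environment file and edit values as needed
    cp example.env .env
    # optionally keep the virtualenv inside the project (recommended above)
    echo 'PIPENV_VENV_IN_PROJECT=true' >> .env
    # install dependencies, then run tests, formatting, and lint via the Makefile
    make deps
    make test
    make fmt lint
    # drop into the pipenv-managed environment for interactive work
    pipenv shell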
diff --git a/python/TODO b/python/TODO
deleted file mode 100644
index 58a463f..0000000
--- a/python/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-
-ingest crawler:
-- SPNv2 only
- - remove most SPNv1/v2 path selection
-- landing page + fulltext hops only (short recursion depth)
-- use wayback client library instead of requests to fetch content
-- https://pypi.org/project/ratelimit/
diff --git a/python/example.env b/python/example.env
index 5064c96..85af66c 100644
--- a/python/example.env
+++ b/python/example.env
@@ -5,3 +5,4 @@ IA_SECRET_KEY="dummy"
CDX_AUTH_TOKEN="dummy"
PETABOX_WEBDATA_SECRET="dummy"
SENTRY_DSN=""
+SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/"
diff --git a/python/grobid2json.py b/python/grobid2json.py
deleted file mode 100755
index a22d47d..0000000
--- a/python/grobid2json.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-"""
-
-import io
-import json
-import argparse
-import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
-
-xml_ns = "http://www.w3.org/XML/1998/namespace"
-ns = "http://www.tei-c.org/ns/1.0"
-
-
-def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
- if not elem:
- return []
- names = []
- for author in elem.findall(".//{%s}author" % ns):
- pn = author.find("./{%s}persName" % ns)
- if not pn:
- continue
- given_name = pn.findtext("./{%s}forename" % ns) or None
- surname = pn.findtext("./{%s}surname" % ns) or None
- full_name = " ".join(pn.itertext())
- obj: Dict[str, Any] = dict(name=full_name)
- if given_name:
- obj["given_name"] = given_name
- if surname:
- obj["surname"] = surname
- ae = author.find("./{%s}affiliation" % ns)
- if ae:
- affiliation: Dict[str, Any] = dict()
- for on in ae.findall("./{%s}orgName" % ns):
- on_type = on.get("type")
- if on_type:
- affiliation[on_type] = on.text
- addr_e = ae.find("./{%s}address" % ns)
- if addr_e:
- address = dict()
- for t in addr_e:
- address[t.tag.split("}")[-1]] = t.text
- if address:
- affiliation["address"] = address
- # affiliation['address'] = {
- # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
- # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
- # 'country': addr.findtext('./{%s}country' % ns) or None,
- # }
- obj["affiliation"] = affiliation
- names.append(obj)
- return names
-
-
-def journal_info(elem: ET.Element) -> Dict[str, Any]:
- journal = dict()
- journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- journal["publisher"] = elem.findtext(
- ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
- )
- if journal["publisher"] == "":
- journal["publisher"] = None
- journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- keys = list(journal.keys())
-
- # remove empty/null keys
- for k in keys:
- if not journal[k]:
- journal.pop(k)
- return journal
-
-
-def biblio_info(elem: ET.Element) -> Dict[str, Any]:
- ref: Dict[str, Any] = dict()
- ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
- # Title stuff is messy in references...
- ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- if other_title:
- if ref["title"]:
- ref["journal"] = other_title
- else:
- ref["journal"] = None
- ref["title"] = other_title
- ref["authors"] = all_authors(elem)
- ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
- if ref["publisher"] == "":
- ref["publisher"] = None
- date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref["date"] = (date is not None) and date.attrib.get("when")
- ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- el = elem.find(".//{%s}ptr[@target]" % ns)
- if el is not None:
- ref["url"] = el.attrib["target"]
- # Hand correction
- if ref["url"].endswith(".Lastaccessed"):
- ref["url"] = ref["url"].replace(".Lastaccessed", "")
- else:
- ref["url"] = None
- return ref
-
-
-def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
-
- if isinstance(content, str):
- tree = ET.parse(io.StringIO(content))
- elif isinstance(content, bytes):
- tree = ET.parse(io.BytesIO(content))
-
- info: Dict[str, Any] = dict()
-
- # print(content)
- # print(content.getvalue())
- tei = tree.getroot()
-
- header = tei.find(".//{%s}teiHeader" % ns)
- if header is None:
- raise ValueError("XML does not look like TEI format")
- application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
- info["grobid_version"] = application_tag.attrib["version"].strip()
- info["grobid_timestamp"] = application_tag.attrib["when"].strip()
- info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- info["authors"] = all_authors(
- header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
- )
- info["journal"] = journal_info(header)
- date = header.find('.//{%s}date[@type="published"]' % ns)
- info["date"] = (date is not None) and date.attrib.get("when")
- info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
- info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
- if info["doi"]:
- info["doi"] = info["doi"].lower()
-
- refs = []
- for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
- ref = biblio_info(bs)
- ref["index"] = i
- refs.append(ref)
- info["citations"] = refs
-
- text = tei.find(".//{%s}text" % (ns))
- # print(text.attrib)
- if text and text.attrib.get("{%s}lang" % xml_ns):
- info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
-
- if encumbered:
- el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
- info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
- info["body"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
- info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
- info["annex"] = (el or None) and " ".join(el.itertext()).strip()
-
- # remove empty/null keys
- keys = list(info.keys())
- for k in keys:
- if not info[k]:
- info.pop(k)
- return info
-
-
-def main() -> None: # pragma no cover
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- description="GROBID TEI XML to JSON",
- usage="%(prog)s [options] <teifile>...",
- )
- parser.add_argument(
- "--no-encumbered",
- action="store_true",
- help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
- )
- parser.add_argument("teifiles", nargs="+")
-
- args = parser.parse_args()
-
- for filename in args.teifiles:
- content = open(filename, "r").read()
- print(
- json.dumps(
- teixml2json(content, encumbered=(not args.no_encumbered)),
- sort_keys=True,
- )
- )
-
-
-if __name__ == "__main__": # pragma no cover
- main()
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 2a1d8b5..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -1,21 +1,28 @@
#!/usr/bin/env python3
-
"""
These are generally for running one-off tasks from the command line. Output
might go to stdout, or might go to Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
-import sys
-import json
import argparse
-import datetime
+import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
+
+
+def run_single(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+ resp["_metadata"] = grobid_client.metadata(resp)
+ print(json.dumps(resp, sort_keys=True))
def run_extract_json(args):
@@ -30,6 +37,7 @@ def run_extract_json(args):
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
@@ -40,7 +48,7 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
@@ -49,10 +57,11 @@ def run_extract_cdx(args):
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
if args.jobs > 1:
@@ -65,6 +74,7 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_transform(args):
grobid_client = GrobidClient()
for line in args.json_file:
@@ -74,62 +84,101 @@ def run_transform(args):
if args.metadata_only:
out = grobid_client.metadata(line)
else:
- out = teixml2json(line['tei_xml'])
+ tei_doc = parse_document_xml(line["tei_xml"])
+ out = tei_doc.to_legacy_dict()
if out:
- if 'source' in line:
- out['source'] = line['source']
+ if "source" in line:
+ out["source"] = line["source"]
print(json.dumps(out))
+def run_parse_crossref_refs(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
+ sub_single = subparsers.add_parser("single")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument(
+ "pdf_file",
+ help="path to PDF file to process",
+ type=argparse.FileType("rb"),
+ )
+
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does GROBID extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does GROBID extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
-
- sub_transform = subparsers.add_parser('transform')
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+ sub_parse_crossref_refs = subparsers.add_parser(
+ "parse-crossref-refs",
+ help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+ )
+ sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+ sub_parse_crossref_refs.add_argument(
+ "json_file",
+ help="JSON-L file to process (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_transform = subparsers.add_parser("transform")
sub_transform.set_defaults(func=run_transform)
- sub_transform.add_argument('--metadata-only',
- action='store_true',
- help="Only pass through bibliographic metadata, not fulltext")
- sub_transform.add_argument('json_file',
+ sub_transform.add_argument(
+ "--metadata-only",
+ action="store_true",
+ help="Only pass through bibliographic metadata, not fulltext",
+ )
+ sub_transform.add_argument(
+ "json_file",
help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -140,10 +189,10 @@ def main():
if args.kafka_mode:
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
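
The `transform` subcommand above now goes through the grobid_tei_xml library rather than the old grobid2json.teixml2json() helper. A minimal standalone sketch of that same conversion on a local TEI file (the "fulltext.tei.xml" path is a hypothetical example):

    import json

    from grobid_tei_xml import parse_document_xml

    with open("fulltext.tei.xml", "r") as f:
        tei_doc = parse_document_xml(f.read())
    # same legacy JSON shape that `transform` prints per line
    print(json.dumps(tei_doc.to_legacy_dict(), sort_keys=True))
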
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 20c65bb..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
-
"""
Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
- md5
- sha1
@@ -22,87 +21,93 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+from typing import Any, Dict, Optional
-def parse(obj):
- if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
- print('skip: test item', file=sys.stderr)
+
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
+ if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"):
+ print("skip: test item", file=sys.stderr)
return None
extid_type = None
extid = None
- if obj['metadata']['identifier'].startswith('arxiv-'):
- extid_type = 'arxiv'
- extid = obj['metadata'].get('source')
+ if obj["metadata"]["identifier"].startswith("arxiv-"):
+ extid_type = "arxiv"
+ extid = obj["metadata"].get("source")
if not extid:
- print('skip: no source', file=sys.stderr)
+ print("skip: no source", file=sys.stderr)
return None
- assert extid.startswith('http://arxiv.org/abs/')
- extid = extid.replace('http://arxiv.org/abs/', '')
- #print(extid)
- assert '/' in extid or '.' in extid
- if not 'v' in extid or not extid[-1].isdigit():
- print('skip: non-versioned arxiv_id', file=sys.stderr)
+ assert extid.startswith("http://arxiv.org/abs/")
+ extid = extid.replace("http://arxiv.org/abs/", "")
+ # print(extid)
+ assert "/" in extid or "." in extid
+ if "v" not in extid or not extid[-1].isdigit():
+ print("skip: non-versioned arxiv_id", file=sys.stderr)
return None
- elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
- extid_type = 'doi'
- extid = obj['metadata']['identifier-doi']
+ elif obj["metadata"]["identifier"].startswith("paper-doi-10_"):
+ extid_type = "doi"
+ extid = obj["metadata"]["identifier-doi"]
assert extid.startswith("10.")
- elif obj['metadata']['identifier'].startswith('pubmed-PMC'):
- extid_type = 'pmcid'
- extid = obj['metadata']['identifier'].replace('pubmed-', '')
+ elif obj["metadata"]["identifier"].startswith("pubmed-PMC"):
+ extid_type = "pmcid"
+ extid = obj["metadata"]["identifier"].replace("pubmed-", "")
assert extid.startswith("PMC")
int(extid[3:])
- elif obj['metadata']['identifier'].startswith('jstor-'):
- extid_type = 'jstor'
- extid = obj['metadata']['identifier'].replace('jstor-', '')
+ elif obj["metadata"]["identifier"].startswith("jstor-"):
+ extid_type = "jstor"
+ extid = obj["metadata"]["identifier"].replace("jstor-", "")
int(extid)
else:
raise NotImplementedError()
pdf_file = None
- for f in obj['files']:
- if f['source'] == "original" and "PDF" in f['format']:
+ for f in obj["files"]:
+ if f["source"] == "original" and "PDF" in f["format"]:
pdf_file = f
break
if not pdf_file:
- print('skip: no PDF found: {}'.format(obj['metadata']['identifier']), file=sys.stderr)
- #for f in obj['files']:
+ print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr)
+ # for f in obj['files']:
# print(f['format'], file=sys.stderr)
return None
- assert pdf_file['name'].endswith('.pdf')
+ assert pdf_file["name"].endswith(".pdf")
match = {
- 'md5': pdf_file['md5'],
- 'sha1': pdf_file['sha1'],
- 'size': int(pdf_file['size']),
- 'mimetype': 'application/pdf',
- 'urls': [
+ "md5": pdf_file["md5"],
+ "sha1": pdf_file["sha1"],
+ "size": int(pdf_file["size"]),
+ "mimetype": "application/pdf",
+ "urls": [
"https://archive.org/download/{}/{}".format(
- obj['metadata']['identifier'],
- pdf_file['name']),
+ obj["metadata"]["identifier"], pdf_file["name"]
+ ),
],
- 'cdx': [],
- 'dois': [],
+ "cdx": [],
+ "dois": [],
}
- if extid_type == 'doi':
- match['dois'] = [extid,]
+ if extid_type == "doi":
+ match["dois"] = [
+ extid,
+ ]
else:
match[extid_type] = extid
return match
-def run():
+
+def run() -> None:
for line in sys.stdin:
if not line:
continue
obj = json.loads(line)
match = parse(obj)
- if match:
+ if match is not None:
print(json.dumps(match, sort_keys=True))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
run()
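
The parse() function above only inspects a handful of item metadata fields. A minimal sketch of calling it directly on a hypothetical arxiv item (all field values below are made-up placeholders, not real IA metadata):

    item = {
        "metadata": {
            "identifier": "arxiv-1234.5678",
            "source": "http://arxiv.org/abs/1234.5678v2",
        },
        "files": [
            {
                "source": "original",
                "format": "Text PDF",
                "name": "1234.5678v2.pdf",
                "md5": "0" * 32,   # placeholder digest
                "sha1": "0" * 40,  # placeholder digest
                "size": "12345",
            }
        ],
    }
    match = parse(item)
    # match carries md5/sha1/size/mimetype, an archive.org download URL,
    # and "arxiv": "1234.5678v2"
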
diff --git a/python/ingest_file.py b/python/ingest_file.py
deleted file mode 100755
index 20b6d67..0000000
--- a/python/ingest_file.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import argparse
-
-from http.server import HTTPServer
-from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
-
-
-def run_single_ingest(args):
- request = dict(
- ingest_type=args.ingest_type,
- base_url=args.url,
- ext_ids=dict(doi=args.doi),
- fatcat=dict(release_ident=args.release_id),
- )
- if args.force_recrawl:
- request['force_recrawl'] = True
- ingester = IngestFileWorker(
- try_spn2=not args.no_spn2,
- html_quick_mode=args.html_quick_mode,
- )
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
- return result
-
-def run_requests(args):
- # TODO: switch to using JsonLinePusher
- ingester = IngestFileWorker(
- try_spn2=not args.no_spn2,
- html_quick_mode=args.html_quick_mode,
- )
- for l in args.json_file:
- request = json.loads(l.strip())
- result = ingester.process(request)
- print(json.dumps(result, sort_keys=True))
-
-def run_api(args):
- port = 8083
- print("Listening on localhost:{}".format(port))
- server = HTTPServer(('', port), IngestFileRequestHandler)
- server.serve_forever()
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- subparsers = parser.add_subparsers()
-
- sub_single= subparsers.add_parser('single',
- help="ingests a single file URL")
- sub_single.set_defaults(func=run_single_ingest)
- sub_single.add_argument('--release-id',
- help="(optional) existing release ident to match to")
- sub_single.add_argument('--doi',
- help="(optional) existing release DOI to match to")
- sub_single.add_argument('--force-recrawl',
- action='store_true',
- help="ignore GWB history and use SPNv2 to re-crawl")
- sub_single.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
- sub_single.add_argument('--ingest-type',
- default="pdf",
- help="type of ingest (pdf, html, etc)")
- sub_single.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
- sub_single.add_argument('url',
- help="URL of paper to fetch")
-
- sub_requests = subparsers.add_parser('requests',
- help="takes a series of ingest requests (JSON, per line) and runs each")
- sub_requests.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
- sub_requests.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
- sub_requests.set_defaults(func=run_requests)
- sub_requests.add_argument('json_file',
- help="JSON file (request per line) to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
-
- sub_api = subparsers.add_parser('api',
- help="starts a simple HTTP server that processes ingest requests")
- sub_api.set_defaults(func=run_api)
- sub_api.add_argument('--port',
- help="HTTP port to listen on",
- default=8033, type=int)
-
- args = parser.parse_args()
- if not args.__dict__.get("func"):
- parser.print_help(file=sys.stderr)
- sys.exit(-1)
-
- args.func(args)
-
-if __name__ == '__main__':
- main()
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
new file mode 100755
index 0000000..0b74f9f
--- /dev/null
+++ b/python/ingest_tool.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import subprocess
+import sys
+from http.server import HTTPServer
+
+import sentry_sdk
+
+from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
+from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
+from sandcrawler.ingest_fileset import IngestFilesetWorker
+
+
+def run_single_ingest(args):
+ request = dict(
+ ingest_type=args.ingest_type,
+ base_url=args.url,
+ ext_ids=dict(doi=args.doi),
+ fatcat=dict(release_ident=args.release_id),
+ )
+ if args.force_recrawl:
+ request["force_recrawl"] = True
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ ingester = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ ingest_file_result_stdout=True,
+ )
+ else:
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ grobid_client=grobid_client,
+ )
+ result = ingester.process(request)
+ print(json.dumps(result, sort_keys=True))
+ return result
+
+
+def run_requests(args):
+ # TODO: switch to using JsonLinePusher
+ file_worker = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ )
+ fileset_worker = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ )
+ for line in args.json_file:
+ request = json.loads(line.strip())
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ result = fileset_worker.process(request)
+ else:
+ result = file_worker.process(request)
+ print(json.dumps(result, sort_keys=True))
+
+
+def run_file_requests_backfill(args):
+ """
+ Special mode for persisting GROBID and pdfextract results to Kafka, but
+ printing ingest result to stdout.
+
+ Can be used to batch re-process known files.
+ """
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
+ worker = IngestFileWorker(
+ grobid_client=grobid_client,
+ sink=None,
+ grobid_sink=grobid_sink,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ try_spn2=False,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
+
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
+def run_api(args):
+ port = 8083
+ print("Listening on localhost:{}".format(port))
+ server = HTTPServer(("", port), IngestFileRequestHandler)
+ server.serve_forever()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
+ subparsers = parser.add_subparsers()
+
+ sub_single = subparsers.add_parser("single", help="ingests a single base URL")
+ sub_single.set_defaults(func=run_single_ingest)
+ sub_single.add_argument(
+ "ingest_type", default="pdf", help="type of ingest (pdf, html, etc)"
+ )
+ sub_single.add_argument(
+ "--release-id", help="(optional) existing release ident to match to"
+ )
+ sub_single.add_argument("--doi", help="(optional) existing release DOI to match to")
+ sub_single.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="ignore GWB history and use SPNv2 to re-crawl",
+ )
+ sub_single.add_argument("--no-spn2", action="store_true", help="don't use live web (SPNv2)")
+ sub_single.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_single.add_argument("url", help="URL of paper to fetch")
+ sub_single.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_requests = subparsers.add_parser(
+ "requests", help="takes a series of ingest requests (JSON, per line) and runs each"
+ )
+ sub_requests.add_argument(
+ "--no-spn2", action="store_true", help="don't use live web (SPNv2)"
+ )
+ sub_requests.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_requests.set_defaults(func=run_requests)
+ sub_requests.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub_api = subparsers.add_parser(
+ "api", help="starts a simple HTTP server that processes ingest requests"
+ )
+ sub_api.set_defaults(func=run_api)
+ sub_api.add_argument("--port", help="HTTP port to listen on", default=8033, type=int)
+
+ sub_file_requests_backfill = subparsers.add_parser(
+ "file-requests-backfill",
+ help="starts a simple HTTP server that processes ingest requests",
+ )
+ sub_file_requests_backfill.set_defaults(func=run_file_requests_backfill)
+ sub_file_requests_backfill.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_file_requests_backfill.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ sub_file_requests_backfill.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
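
The `single` subcommand above is a thin wrapper around IngestFileWorker. A minimal sketch of the same wiring driven directly from Python (the URL and DOI are hypothetical examples; SPNv2 is disabled here, so only existing wayback captures are used):

    import json

    from sandcrawler import GrobidClient
    from sandcrawler.ingest_file import IngestFileWorker

    grobid_client = GrobidClient(host_url="https://grobid.qa.fatcat.wiki")
    worker = IngestFileWorker(
        try_spn2=False,
        html_quick_mode=True,
        grobid_client=grobid_client,
    )
    request = {
        "ingest_type": "pdf",
        "base_url": "https://example.com/paper.pdf",   # hypothetical
        "ext_ids": {"doi": "10.1234/example"},          # hypothetical
    }
    print(json.dumps(worker.process(request), sort_keys=True))
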
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 10a0f48..28d6397 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -1,15 +1,11 @@
#!/usr/bin/env python3
-
"""
KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
-import sys
-import json
import argparse
-import datetime
+import sys
-from grobid2json import teixml2json
from sandcrawler import *
@@ -20,10 +16,13 @@ def run_extract_json(args):
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
wayback_client = WaybackClient()
if args.jobs > 1:
@@ -33,19 +32,22 @@ def run_extract_cdx(args):
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfExtractWorker(wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_extract_zipfile(args):
if args.jobs > 1:
print("multi-processing: {}".format(args.jobs), file=sys.stderr)
@@ -57,9 +59,10 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+
def run_single(args):
worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
- with open(args.pdf_file, 'rb') as pdf_file:
+ with open(args.pdf_file, "rb") as pdf_file:
pdf_bytes = pdf_file.read()
worker.push_record(pdf_bytes)
worker.finish()
@@ -67,51 +70,55 @@ def run_single(args):
args.thumbnail_sink.finish()
-
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json',
- help="for each JSON line with CDX info, fetches PDF and does PDF extraction")
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does PDF extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx',
- help="for each CDX line, fetches PDF and does PDF extraction")
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does PDF extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
- help="opens zipfile, iterates over PDF files inside and does PDF extract for each")
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does PDF extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
- sub_single = subparsers.add_parser('single',
- help="opens single PDF and extracts it")
+ sub_single = subparsers.add_parser("single", help="opens single PDF and extracts it")
sub_single.set_defaults(func=run_single)
- sub_single.add_argument('pdf_file',
- help="single PDF to extract",
- type=str)
+ sub_single.add_argument("pdf_file", help="single PDF to extract", type=str)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -123,17 +130,22 @@ def main():
if args.kafka_mode:
text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
- args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
- produce_topic=text_topic)
- args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=thumbnail_topic)
- print("Running in kafka output mode, publishing to {} and {}\n".format(
- text_topic, thumbnail_topic), file=sys.stderr)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic)
+ args.thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic
+ )
+ print(
+ "Running in kafka output mode, publishing to {} and {}\n".format(
+ text_topic, thumbnail_topic
+ ),
+ file=sys.stderr,
+ )
else:
args.sink = None
args.thumbnail_sink = None
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
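
The `single` subcommand above pushes raw PDF bytes through PdfExtractBlobWorker. A minimal sketch with no Kafka sinks configured, as in the tool's default stdout mode (the PDF path is a hypothetical example):

    from sandcrawler import PdfExtractBlobWorker

    worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
    with open("paper.pdf", "rb") as f:
        worker.push_record(f.read())
    worker.finish()
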
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 5cffa8c..24b749d 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -1,18 +1,15 @@
#!/usr/bin/env python3
-
"""
Basically just a copy of grobid_tool.py, but for PDF classification instead of
text extraction.
Example of large parallel run, locally:
-cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
-import sys
-import json
import argparse
-import datetime
+import sys
from sandcrawler import *
@@ -21,37 +18,47 @@ def run_classify_pdf_json(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_classify_pdf_cdx(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
wayback_client = WaybackClient()
if args.jobs > 1:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = CdxLinePusher(
multi_worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
batch_size=args.jobs,
)
else:
- worker = PdfTrioWorker(pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode)
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
pusher = CdxLinePusher(
worker,
args.cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher.run()
+
def run_classify_pdf_zipfile(args):
pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode)
@@ -60,48 +67,59 @@ def run_classify_pdf_zipfile(args):
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--pdftrio-host',
- default="http://pdftrio.qa.fatcat.wiki",
- help="pdftrio API host/port")
- parser.add_argument('--pdftrio-mode',
- default="auto",
- help="which classification mode to use")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--pdftrio-host", default="http://pdftrio.qa.fatcat.wiki", help="pdftrio API host/port"
+ )
+ parser.add_argument(
+ "--pdftrio-mode", default="auto", help="which classification mode to use"
+ )
subparsers = parser.add_subparsers()
- sub_classify_pdf_json = subparsers.add_parser('classify-pdf-json',
- help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_json = subparsers.add_parser(
+ "classify-pdf-json",
+ help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json)
- sub_classify_pdf_json.add_argument('json_file',
+ sub_classify_pdf_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_cdx = subparsers.add_parser('classify-pdf-cdx',
- help="for each CDX line, fetches PDF and does pdftrio classify_pdfion")
+ sub_classify_pdf_cdx = subparsers.add_parser(
+ "classify-pdf-cdx",
+ help="for each CDX line, fetches PDF and does pdftrio classify_pdfion",
+ )
sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx)
- sub_classify_pdf_cdx.add_argument('cdx_file',
+ sub_classify_pdf_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_classify_pdf_zipfile = subparsers.add_parser('classify-pdf-zipfile',
- help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each")
+ sub_classify_pdf_zipfile = subparsers.add_parser(
+ "classify-pdf-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each",
+ )
sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile)
- sub_classify_pdf_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to classify",
- type=str)
+ sub_classify_pdf_zipfile.add_argument(
+ "zip_file", help="zipfile with PDFs to classify", type=str
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -112,10 +130,10 @@ def main():
if args.kafka_mode:
produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
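
All of the classify commands above route PDF bytes through a PdfTrioWorker or PdfTrioBlobWorker against the pdftrio API. A rough sketch for one local file, reusing the blob worker from classify-pdf-zipfile (the host URL and PDF path are hypothetical examples; "auto" matches the tool's default --pdftrio-mode):

    from sandcrawler import PdfTrioBlobWorker, PdfTrioClient

    pdftrio_client = PdfTrioClient(host_url="http://localhost:3939")
    worker = PdfTrioBlobWorker(pdftrio_client, sink=None, mode="auto")
    with open("paper.pdf", "rb") as f:
        worker.push_record(f.read())
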
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 69e9374..e08d66c 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
@@ -7,9 +6,9 @@ Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import argparse
import os
import sys
-import argparse
from sandcrawler import *
from sandcrawler.persist import *
@@ -19,7 +18,7 @@ def run_cdx(args):
worker = PersistCdxWorker(
db_url=args.db_url,
)
- filter_mimetypes = ['application/pdf']
+ filter_mimetypes = ["application/pdf"]
if args.no_mimetype_filter:
filter_mimetypes = None
pusher = CdxLinePusher(
@@ -27,11 +26,12 @@ def run_cdx(args):
args.cdx_file,
filter_http_statuses=[200, 226],
filter_mimetypes=filter_mimetypes,
- #allow_octet_stream
+ # allow_octet_stream
batch_size=200,
)
pusher.run()
+
def run_grobid(args):
worker = PersistGrobidWorker(
db_url=args.db_url,
@@ -49,6 +49,7 @@ def run_grobid(args):
)
pusher.run()
+
def run_grobid_disk(args):
"""
Writes XML to individual files on disk, and also prints non-XML metadata to
@@ -63,6 +64,7 @@ def run_grobid_disk(args):
)
pusher.run()
+
def run_pdftrio(args):
worker = PersistPdfTrioWorker(
db_url=args.db_url,
@@ -74,6 +76,7 @@ def run_pdftrio(args):
)
pusher.run()
+
def run_pdftext(args):
worker = PersistPdfTextWorker(
db_url=args.db_url,
@@ -91,6 +94,7 @@ def run_pdftext(args):
)
pusher.run()
+
def run_ingest_file_result(args):
worker = PersistIngestFileResultWorker(
db_url=args.db_url,
@@ -102,6 +106,7 @@ def run_ingest_file_result(args):
)
pusher.run()
+
def run_ingest_request(args):
worker = PersistIngestRequestWorker(
db_url=args.db_url,
@@ -113,92 +118,186 @@ def run_ingest_request(args):
)
pusher.run()
+
+def run_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
+def run_grobid_refs(args):
+ worker = PersistGrobidRefsWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--db-url',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (seaweedfs) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (seaweedfs) bucket to persist into",
- default="sandcrawler-dev")
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_cdx = subparsers.add_parser('cdx',
- help="backfill a CDX file into postgresql cdx table")
+ sub_cdx = subparsers.add_parser("cdx", help="backfill a CDX file into postgresql cdx table")
sub_cdx.set_defaults(func=run_cdx)
- sub_cdx.add_argument('cdx_file',
+ sub_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_cdx.add_argument('--no-mimetype-filter',
- action='store_true',
- help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
+ type=argparse.FileType("r"),
+ )
+ sub_cdx.add_argument(
+ "--no-mimetype-filter",
+ action="store_true",
+ help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)",
+ )
- sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ sub_grobid = subparsers.add_parser(
+ "grobid", help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_grobid.set_defaults(func=run_grobid)
- sub_grobid.add_argument('json_file',
+ sub_grobid.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_pdftext = subparsers.add_parser('pdftext',
- help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ type=argparse.FileType("r"),
+ )
+ sub_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_pdftext = subparsers.add_parser(
+ "pdftext", help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftext.set_defaults(func=run_pdftext)
- sub_pdftext.add_argument('json_file',
+ sub_pdftext.add_argument(
+ "json_file",
help="pdftext file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
-
- sub_grobid_disk = subparsers.add_parser('grobid-disk',
- help="dump GRBOID output to (local) files on disk")
+ type=argparse.FileType("r"),
+ )
+ sub_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_grobid_disk = subparsers.add_parser(
+ "grobid-disk", help="dump GRBOID output to (local) files on disk"
+ )
sub_grobid_disk.set_defaults(func=run_grobid_disk)
- sub_grobid_disk.add_argument('json_file',
+ sub_grobid_disk.add_argument(
+ "json_file",
help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid_disk.add_argument('output_dir',
- help="base directory to output into",
- type=str)
+ type=argparse.FileType("r"),
+ )
+ sub_grobid_disk.add_argument("output_dir", help="base directory to output into", type=str)
- sub_pdftrio = subparsers.add_parser('pdftrio',
- help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ sub_pdftrio = subparsers.add_parser(
+ "pdftrio", help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
sub_pdftrio.set_defaults(func=run_pdftrio)
- sub_pdftrio.add_argument('json_file',
+ sub_pdftrio.add_argument(
+ "json_file",
help="pdftrio file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
- help="backfill a ingest_file_result JSON dump into postgresql")
+ sub_ingest_file_result = subparsers.add_parser(
+ "ingest-file-result", help="backfill a ingest_file_result JSON dump into postgresql"
+ )
sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
- sub_ingest_file_result.add_argument('json_file',
+ sub_ingest_file_result.add_argument(
+ "json_file",
help="ingest_file_result file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_ingest_request = subparsers.add_parser('ingest-request',
- help="backfill a ingest_request JSON dump into postgresql")
+ sub_ingest_request = subparsers.add_parser(
+ "ingest-request", help="backfill a ingest_request JSON dump into postgresql"
+ )
sub_ingest_request.set_defaults(func=run_ingest_request)
- sub_ingest_request.add_argument('json_file',
+ sub_ingest_request.add_argument(
+ "json_file",
help="ingest_request to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
+
+ sub_crossref = subparsers.add_parser(
+ "crossref",
+ help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+ )
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument(
+ "json_file",
+ help="crossref file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+
+ sub_grobid_refs = subparsers.add_parser(
+ "grobid-refs", help="backfill a grobid_refs JSON dump into postgresql"
+ )
+ sub_grobid_refs.set_defaults(func=run_grobid_refs)
+ sub_grobid_refs.add_argument(
+ "json_file",
+ help="grobid_refs to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -207,5 +306,6 @@ def main():
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
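
The new `crossref` subcommand above batches Crossref records into postgresql and, with --parse-refs, runs unstructured references through GROBID. A minimal sketch of the same wiring (the dump filename is a hypothetical example; the db and GROBID endpoints mirror the tool's defaults):

    from sandcrawler import GrobidClient, JsonLinePusher
    from sandcrawler.persist import PersistCrossrefWorker

    grobid_client = GrobidClient(host_url="https://grobid.qa.fatcat.wiki")
    worker = PersistCrossrefWorker(
        db_url="postgres:///sandcrawler",
        grobid_client=grobid_client,
        parse_refs=True,
    )
    with open("crossref_dump.json", "r") as f:
        # smaller batches when refs are parsed, as in run_crossref() above
        JsonLinePusher(worker, f, batch_size=10).run()
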
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..2cef007
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.isort]
+profile = "black"
+line_length = 96
diff --git a/python/pytest.ini b/python/pytest.ini
index 034a68e..18e8cf0 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -17,5 +17,10 @@ filterwarnings =
ignore::DeprecationWarning:.*wayback
ignore::DeprecationWarning:.*PIL
ignore::DeprecationWarning:.*justext
+ ignore::DeprecationWarning:.*internetarchive
+ ignore::DeprecationWarning:.*minio
+ ignore::DeprecationWarning:.*base_reporter
+ ignore::DeprecationWarning:.*loccache
+ ignore:.*pytz-deprecation-shim
log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index e461462..469c2a2 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,10 +1,49 @@
-
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
-from .ingest import IngestFileWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (
+ CdxApiClient,
+ CdxApiError,
+ CdxPartial,
+ CdxRow,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WarcResource,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .ingest_file import IngestFileWorker
+from .ingest_fileset import IngestFilesetWorker
+from .misc import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_datetime,
+ parse_cdx_line,
+)
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (
+ PersistCdxWorker,
+ PersistGrobidDiskWorker,
+ PersistGrobidWorker,
+ PersistIngestFileResultWorker,
+ PersistIngestRequestWorker,
+ PersistPdfTextWorker,
+ PersistPdfTrioWorker,
+ PersistThumbnailWorker,
+)
+from .workers import (
+ BlackholeSink,
+ CdxLinePusher,
+ JsonLinePusher,
+ KafkaCompressSink,
+ KafkaJsonPusher,
+ KafkaSink,
+ MultiprocessWrapper,
+ ZipfilePusher,
+)
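
The re-exports above keep the flat import surface the command-line tools rely on; for example, all of these names resolve straight from the package root:

    from sandcrawler import (
        CdxApiClient,
        GrobidClient,
        IngestFileWorker,
        SandcrawlerPostgrestClient,
        SavePageNowClient,
        WaybackClient,
    )
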
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 066e53b..f9018ec 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,51 +1,58 @@
-
-import json
import datetime
-from typing import Optional
+import json
+from typing import Any, Dict, List, Optional, Tuple
import psycopg2
import psycopg2.extras
-import requests
-class SandcrawlerPostgrestClient:
+from .misc import requests_retry_session
- def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
+
+class SandcrawlerPostgrestClient:
+ def __init__(self, api_url: str = "http://wbgrp-svc506.us.archive.org:3030", **kwargs):
self.api_url = api_url
+ self.http_session = requests_retry_session()
- def get_cdx(self, url):
- resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
+ def get_cdx(self, url: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/cdx", params=dict(url="eq." + url))
resp.raise_for_status()
return resp.json() or None
- def get_grobid(self, sha1):
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1))
+ def get_grobid(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdftrio(self, sha1):
- resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.'+sha1))
+ def get_pdftrio(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
- def get_pdf_meta(self, sha1):
- resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_pdf_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
def get_html_meta(self, sha1hex: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/html_meta",
params=dict(sha1hex=f"eq.{sha1hex}"),
)
@@ -56,17 +63,19 @@ class SandcrawlerPostgrestClient:
else:
return None
- def get_file_meta(self, sha1):
- resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
+ def get_file_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
else:
return None
def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/ingest_file_result",
params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
)
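
The PostgREST client above now goes through requests_retry_session() and returns a single row dict (or None) per lookup. A minimal sketch against the default endpoint (the sha1hex value is a made-up placeholder):

    from sandcrawler import SandcrawlerPostgrestClient

    client = SandcrawlerPostgrestClient()
    row = client.get_grobid("0" * 40)  # placeholder sha1hex
    if row is not None:
        print(row)
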
@@ -77,27 +86,76 @@ class SandcrawlerPostgrestClient:
else:
return None
-class SandcrawlerPostgresClient:
+ def get_ingest_fileset_platform(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_fileset_platform",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref_with_refs(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}")
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
- def __init__(self, db_url, **kwargs):
+ def get_grobid_refs(self, source: str, source_id: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid_refs",
+ params=dict(source=f"eq.{source}", source_id=f"eq.{source_id}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+
+class SandcrawlerPostgresClient:
+ def __init__(self, db_url: str, **kwargs):
self.conn = psycopg2.connect(db_url)
- def cursor(self):
+ def cursor(self) -> psycopg2.extensions.cursor:
return self.conn.cursor()
- def commit(self):
- return self.conn.commit()
+ def commit(self) -> None:
+ self.conn.commit()
- def _inserts_and_updates(self, resp, on_conflict):
- resp = [int(r[0]) for r in resp]
- inserts = len([r for r in resp if r == 0])
+ def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
+ resp_codes = [int(r[0]) for r in resp]
+ inserts = len([r for r in resp_codes if r == 0])
if on_conflict == "update":
- updates = len([r for r in resp if r != 0])
+ updates = len([r for r in resp_codes if r != 0])
else:
updates = 0
return (inserts, updates)
- def insert_cdx(self, cur, batch, on_conflict="nothing"):
+ def insert_cdx(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -110,26 +168,35 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d for d in batch if d.get('warc_path')]
+ batch = [d for d in batch if d.get("warc_path")]
if not batch:
return (0, 0)
- batch = [(d['url'],
- d['datetime'],
- d['sha1hex'],
- d['mimetype'],
- d['warc_path'],
- int(d['warc_csize']),
- int(d['warc_offset']))
- for d in batch]
+ rows = [
+ (
+ d["url"],
+ d["datetime"],
+ d["sha1hex"],
+ d["mimetype"],
+ d["warc_path"],
+ int(d["warc_csize"]),
+ int(d["warc_offset"]),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (url, datetime)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_file_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_file_meta(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -148,21 +215,24 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['sha1hex'],
- d['sha256hex'],
- d['md5hex'],
- int(d['size_bytes']),
- d['mimetype'])
- for d in batch]
+ rows = [
+ (d["sha1hex"], d["sha256hex"], d["md5hex"], int(d["size_bytes"]), d["mimetype"])
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_grobid(self, cur, batch, on_conflict="nothing"):
+ def insert_grobid(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -184,33 +254,39 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
for r in batch:
- if r.get('metadata'):
+ if r.get("metadata"):
# sometimes these are only in metadata; shouldn't pass through
# though (to save database space)
- dupe_fields = ('fatcat_release', 'grobid_version')
+ dupe_fields = ("fatcat_release", "grobid_version")
for k in dupe_fields:
- if not k in r:
- r[k] = r['metadata'].get(k)
- r['metadata'].pop(k, None)
- r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
- batch = [(d['key'],
- d.get('grobid_version') or None,
- d['status_code'],
- d['status'],
- d.get('fatcat_release') or None,
- d.get('updated') or datetime.datetime.now(),
- d.get('metadata') or None ,
- )
- for d in batch]
+ if k not in r:
+ r[k] = r["metadata"].get(k)
+ r["metadata"].pop(k, None)
+ r["metadata"] = json.dumps(r["metadata"], sort_keys=True)
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["key"],
+ d.get("grobid_version") or None,
+ d["status_code"],
+ d["status"],
+ d.get("fatcat_release") or None,
+ d.get("updated") or now,
+ d.get("metadata") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdf_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_pdf_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -239,16 +315,17 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d.to_sql_tuple() for d in batch]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_html_meta(self, cur, batch, on_conflict="nothing"):
+ def insert_html_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -274,16 +351,20 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d.to_sql_tuple() for d in batch]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_pdftrio(self, cur, batch, on_conflict="nothing"):
+ def insert_pdftrio(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
@@ -309,29 +390,36 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [
+ now = datetime.datetime.now()
+ rows = [
(
- d['key'],
- d.get('updated') or datetime.datetime.now(),
- d['status_code'],
- d['status'],
- d.get('versions', {}).get('pdftrio_version') or None,
- d.get('versions', {}).get('models_date') or None,
- d.get('ensemble_score'),
- d.get('bert_score'),
- d.get('linear_score'),
- d.get('image_score'),
+ d["key"],
+ d.get("updated") or now,
+ d["status_code"],
+ d["status"],
+ d.get("versions", {}).get("pdftrio_version") or None,
+ d.get("versions", {}).get("models_date") or None,
+ d.get("ensemble_score"),
+ d.get("bert_score"),
+ d.get("linear_score"),
+ d.get("image_score"),
)
- for d in batch]
+ for d in batch
+ ]
# filter out duplicate rows by key (sha1hex)
- batch_dict = dict()
- for b in batch:
- batch_dict[b[0]] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_request(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_request(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -345,35 +433,43 @@ class SandcrawlerPostgresClient:
sql += " RETURNING xmax;"
for r in batch:
# in case these fields were already packed into 'request'
- extra = r.get('request', {})
- for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'):
+ extra = r.get("request", {})
+ for k in ("ext_ids", "fatcat_release", "edit_extra", "rel"):
if r.get(k):
extra[k] = r[k]
if extra:
- r['extra'] = json.dumps(extra, sort_keys=True)
- batch = [(d['link_source'],
- d['link_source_id'],
- d['ingest_type'],
- d['base_url'],
- d.get('ingest_request_source'),
- d.get('release_stage') or None,
- d.get('extra') or None,
- )
- for d in batch]
+ r["extra"] = json.dumps(extra, sort_keys=True)
+ rows = [
+ (
+ d["link_source"],
+ d["link_source_id"],
+ d["ingest_type"],
+ d["base_url"],
+ d.get("ingest_request_source"),
+ d.get("release_stage") or None,
+ d.get("extra") or None,
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1], b[2], b[3])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1], b[2], b[3])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
- def insert_ingest_file_result(self, cur, batch, on_conflict="nothing"):
+ def insert_ingest_file_result(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
VALUES %s
- ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
+ ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
"""
if on_conflict.lower() == "nothing":
sql += " NOTHING"
@@ -390,20 +486,165 @@ class SandcrawlerPostgresClient:
else:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [(d['ingest_type'],
- d['base_url'],
- bool(d['hit']),
- d['status'],
- d.get('terminal_url'),
- d.get('terminal_dt'),
- d.get('terminal_status_code'),
- d.get('terminal_sha1hex'),
- )
- for d in batch]
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("terminal_url"),
+ d.get("terminal_dt"),
+ d.get("terminal_status_code"),
+ d.get("terminal_sha1hex"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_fileset_platform(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
+ VALUES %s
+            ON CONFLICT ON CONSTRAINT ingest_fileset_platform_pkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=now(),
+ hit=EXCLUDED.hit,
+ status=EXCLUDED.status,
+ platform_name=EXCLUDED.platform_name,
+ platform_domain=EXCLUDED.platform_domain,
+ platform_id=EXCLUDED.platform_id,
+ ingest_strategy=EXCLUDED.ingest_strategy,
+ total_size=EXCLUDED.total_size,
+ file_count=EXCLUDED.file_count,
+ archiveorg_item_name=EXCLUDED.archiveorg_item_name,
+ archiveorg_item_bundle_path=EXCLUDED.archiveorg_item_bundle_path,
+ web_bundle_url=EXCLUDED.web_bundle_url,
+ web_bundle_dt=EXCLUDED.web_bundle_dt,
+ manifest=EXCLUDED.manifest
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("platform_name"),
+ d.get("platform_domain"),
+ d.get("platform_id"),
+ d.get("ingest_strategy"),
+ d.get("total_size"),
+ d.get("file_count"),
+ d.get("archiveorg_item_name"),
+ d.get("archiveorg_item_bundle_path"),
+ d.get("web_bundle_url"),
+ d.get("web_bundle_dt"),
+ d.get("manifest"),
+ )
+ for d in batch
+ ]
# filter out duplicate rows by key (ingest_type, base_url)
- batch_dict = dict()
- for b in batch:
- batch_dict[(b[0], b[1])] = b
- batch = list(batch_dict.values())
- resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_crossref(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ crossref (doi, indexed, record)
+ VALUES %s
+ ON CONFLICT (doi) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ indexed=EXCLUDED.indexed,
+ record=EXCLUDED.record
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["doi"],
+ d.get("indexed") or None,
+ json.dumps(d["record"], sort_keys=True),
+ )
+ for d in batch
+ ]
+        # filter out duplicate rows by key (doi)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_grobid_refs(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ grobid_refs (source, source_id, source_ts, updated, refs_json)
+ VALUES %s
+ ON CONFLICT (source, source_id) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ source_ts=EXCLUDED.source_ts,
+ updated=EXCLUDED.updated,
+ refs_json=EXCLUDED.refs_json
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["source"],
+ d["source_id"],
+ d.get("source_ts") or None,
+ d.get("updated") or now,
+ json.dumps(d["refs_json"], sort_keys=True),
+ )
+ for d in batch
+ ]
+        # filter out duplicate rows by key (source, source_id)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
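As a quick illustration of how these upsert helpers are meant to be called, here is a minimal sketch (the client and conn setup is assumed, not shown in this diff):

    # assumes `client` is a SandcrawlerPostgresClient and `conn` an open psycopg2 connection
    crossref_batch = [
        {
            "doi": "10.1234/example",
            "indexed": None,  # stored as NULL when missing
            "record": {"DOI": "10.1234/example", "title": ["Example Work"]},
        }
    ]
    with conn.cursor() as cur:
        inserted, updated = client.insert_crossref(cur, crossref_batch, on_conflict="update")
    conn.commit()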
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..5c13318
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,832 @@
+import urllib.parse
+from typing import Optional, Tuple
+
+import internetarchive
+
+from sandcrawler.fileset_types import (
+ FilesetManifestFile,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.misc import requests_retry_session
+
+
+class FilesetPlatformHelper:
+ def __init__(self):
+ self.platform_name = "unknown"
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ """
+ Does this request look like it matches this platform?
+ """
+ raise NotImplementedError()
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+ raise NotImplementedError()
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ assert item.manifest
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
+ largest_size = max([m.size or 0 for m in item.manifest]) or 0
+ if len(item.manifest) == 1:
+ if total_size < 64 * 1024 * 1024:
+ return IngestStrategy.WebFile
+ else:
+ return IngestStrategy.ArchiveorgFile
+ else:
+ if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024:
+ return IngestStrategy.WebFileset
+ else:
+ return IngestStrategy.ArchiveorgFileset
+
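A minimal sketch of the size thresholds above (made-up values; assumes pydantic v1 semantics, where Optional model fields default to None):

    helper = FilesetPlatformHelper()
    small = FilesetPlatformItem(
        platform_name="example",
        platform_status="success",
        manifest=[
            FilesetManifestFile(path="data.csv", size=10 * 1024 * 1024),
            FilesetManifestFile(path="README.txt", size=1024),
        ],
    )
    # two small files: fetch over the web as a fileset
    assert helper.chose_strategy(small) == IngestStrategy.WebFileset

    huge_single = FilesetPlatformItem(
        platform_name="example",
        platform_status="success",
        manifest=[FilesetManifestFile(path="archive.tar", size=5 * 1024 * 1024 * 1024)],
    )
    # one large file: archive.org item instead
    assert helper.chose_strategy(huge_single) == IngestStrategy.ArchiveorgFile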
+
+class DataverseHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "dataverse"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_dataverse_persistentid(pid: str) -> dict:
+ """
+ Parses a persistentId into 5 sections:
+
+ - type (doi or hdl)
+ - authority (eg, DOI prefix)
+ - shoulder (optional, eg 'DVN')
+        - dataset_id (6 characters)
+        - file_id (optional, 6 characters)
+
+        The returned dict always includes all components; optional components are
+        None when absent.
+
+        This is possible because the Dataverse software only supports a handful of
+        configurations and persistent identifier types.
+
+ If there is an error parsing, raises a ValueError
+ """
+ id_type = None
+ if pid.startswith("doi:10."):
+ id_type = "doi"
+ pid = pid[4:]
+ elif pid.startswith("hdl:"):
+ id_type = "hdl"
+ pid = pid[4:]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ comp = pid.split("/")
+ if len(comp) < 2:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ authority = comp[0]
+ shoulder = None
+ dataset_id = None
+ file_id = None
+ if len(comp[1]) != 6 and len(comp) == 3:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ elif len(comp[1]) != 6 and len(comp) == 4:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ file_id = comp[3]
+ elif len(comp[1]) == 6 and len(comp) == 2:
+ dataset_id = comp[1]
+ elif len(comp[1]) == 6 and len(comp) == 3:
+ dataset_id = comp[1]
+ file_id = comp[2]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ if len(dataset_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse dataset id: {dataset_id}")
+ if file_id and len(file_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse file id: {file_id}")
+
+ return {
+ "type": id_type,
+ "authority": authority,
+ "shoulder": shoulder,
+ "dataset_id": dataset_id,
+ "file_id": file_id,
+ }
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: could also do HTML platform detection or something?
+
+ components = urllib.parse.urlparse(url)
+ # platform_domain = components.netloc.split(':')[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not id_param:
+ return False
+ platform_id = id_param[0]
+
+ try:
+ self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ return False
+
+ return True
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+
+
+ HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not (id_param and id_param[0]):
+ raise PlatformScopeError("Expected a Dataverse persistentId in URL")
+ platform_id = id_param[0]
+ version_param = params.get("version")
+ dataset_version = None
+ if version_param:
+ dataset_version = version_param[0]
+
+ try:
+ parsed_id = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ raise PlatformScopeError("not actually in scope")
+
+ if parsed_id["file_id"]:
+ # TODO: maybe we could support this?
+ raise PlatformScopeError(
+ "only entire dataverse datasets can be archived with this tool"
+ )
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ if not dataset_version:
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+ if "latestVersion" not in obj["data"]:
+ raise PlatformScopeError("could not find latest version for dataverse record")
+ obj_latest = obj["data"]["latestVersion"]
+ dataset_version = (
+ f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ obj_latest = obj["data"]["latestVersion"]
+ assert (
+ dataset_version
+ == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+ assert platform_id == obj_latest["datasetPersistentId"]
+
+ manifest = []
+ for row in obj_latest["files"]:
+ df = row["dataFile"]
+ df_persistent_id = df["persistentId"]
+ platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
+ if df.get("originalFileName"):
+ platform_url += "&format=original"
+
+ extra = dict()
+ # TODO: always save the version field?
+ if row.get("version") != 1:
+ extra["version"] = row["version"]
+ if "description" in df:
+ extra["description"] = df["description"]
+ manifest.append(
+ FilesetManifestFile(
+ path=df.get("originalFileName") or df["filename"],
+ size=df.get("originalFileSize") or df["filesize"],
+ md5=df["md5"],
+ # NOTE: don't get: sha1, sha256
+ mimetype=df["contentType"],
+ platform_url=platform_url,
+ extra=extra or None,
+ )
+ )
+
+ platform_sub_id = platform_id.split("/")[-1]
+ archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ date=obj_latest["releaseTime"].split("T")[0],
+ source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+ )
+ if platform_id.startswith("doi:10."):
+ archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+ for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+ if block["typeName"] == "title":
+ archiveorg_item_meta["title"] = block["value"]
+ elif block["typeName"] == "depositor":
+ archiveorg_item_meta["creator"] = block["value"]
+ elif block["typeName"] == "dsDescription":
+ archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+ "value"
+ ]
+
+ archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+ if obj_latest.get("termsOfUse"):
+ archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_dataverse_persistentid() -> None:
+
+ valid = {
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": "LL6WXZ",
+ },
+ "hdl:20.500.12690/RIN/IDDOAH/BTNH25": {
+ "type": "hdl",
+ "authority": "20.500.12690",
+ "shoulder": "RIN",
+ "dataset_id": "IDDOAH",
+ "file_id": "BTNH25",
+ },
+ "doi:10.7910/DVN/6HPRIG": {
+ "type": "doi",
+ "authority": "10.7910",
+ "shoulder": "DVN",
+ "dataset_id": "6HPRIG",
+ "file_id": None,
+ },
+ }
+
+ invalid = [
+ # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ "doi:10.25625/abcd",
+ "other:10.25625/LL6WXZ",
+ "10.25625/LL6WXZ",
+ "doi:10.5072/FK2/J8SJZB/LL6WXZv123",
+ ]
+
+ for pid, val in valid.items():
+ assert DataverseHelper.parse_dataverse_persistentid(pid) == val
+
+ for pid in invalid:
+ try:
+ DataverseHelper.parse_dataverse_persistentid(pid)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class FigshareHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "figshare"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_figshare_url_path(path: str) -> Tuple[str, Optional[str]]:
+ """
+ Tries to parse a figshare URL into ID number and (optional) version number.
+
+ Returns a two-element tuple; version number will be None if not found
+
+ Raises a ValueError if not a figshare URL
+ """
+ # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
+
+ comp = path.split("/")
+ if len(comp) < 4 or comp[1] != "articles":
+ raise ValueError(f"not a figshare URL: {path}")
+
+ comp = comp[2:]
+ if comp[0] in [
+ "dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
+ ]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
+ else:
+ raise ValueError(f"couldn't find figshare identiier: {path}")
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ # only work with full, versioned figshare.com URLs
+ if "figshare.com" not in platform_domain:
+ return False
+
+ try:
+ parsed = self.parse_figshare_url_path(components.path)
+ except ValueError:
+ return False
+
+ # has file component
+ if parsed[0] and parsed[1]:
+ return True
+
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+ assert (
+ dataset_version and dataset_version.isdigit()
+ ), f"expected numeric: {dataset_version}"
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ # TODO: implement this code path
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ # figshare_type = obj['defined_type_name']
+
+ if not obj["is_public"]:
+ raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+ if obj["is_embargoed"]:
+ raise PlatformRestrictedError(
+ f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ manifest.append(
+ FilesetManifestFile(
+ path=row["name"],
+ size=row["size"],
+ md5=row["computed_md5"],
+ # NOTE: don't get: sha1, sha256, mimetype
+ platform_url=row["download_url"],
+ # extra=dict(),
+ )
+ )
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
+
+ authors = []
+ for author in obj["authors"]:
+ authors.append(author["full_name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["title"],
+ date=obj["published_date"],
+ source=obj["url_public_html"],
+ description=obj["description"],
+ license=obj["license"]["url"],
+ version=obj["version"],
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_figshare_url_path() -> None:
+
+ valid = {
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+ "8987858",
+ "1",
+ ),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+ "8987858",
+ None,
+ ),
+ "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+ "12127176",
+ "4",
+ ),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
+ }
+
+ invalid = [
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+ ]
+
+ for path, val in valid.items():
+ assert FigshareHelper.parse_figshare_url_path(path) == val
+
+ for path in invalid:
+ try:
+ FigshareHelper.parse_figshare_url_path(path)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class ZenodoHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "zenodo"
+ self.session = requests_retry_session()
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if platform_domain == "zenodo.org" and "/record/" in components.path:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: also look in base_url and resource-non-terminal for ident? to
+ # check for work-level redirects
+
+ # 1. extract identifier from URL
+ # eg: https://zenodo.org/record/5230255
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+        if len(components.path.split("/")) < 3:
+            raise PlatformScopeError("Expected a complete zenodo record URL")
+
+ platform_id = components.path.split("/")[2]
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+
+ if "zenodo.org" not in platform_domain:
+ raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
+
+ # 2. API fetch
+ resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}", timeout=60.0)
+ if resp.status_code == 410:
+ raise PlatformRestrictedError("record deleted")
+ resp.raise_for_status()
+ obj = resp.json()
+
+ assert obj["id"] == int(platform_id)
+ work_id = obj["conceptrecid"]
+ if work_id == obj["id"]:
+ raise PlatformScopeError(
+ "got a work-level zenodo record, not a versioned record: {work_id}"
+ )
+
+ # zenodo_type = obj['metadata']['resource_type']['type']
+
+ if obj["metadata"]["access_right"] != "open":
+ raise PlatformRestrictedError(
+ "not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ mf = FilesetManifestFile(
+ path=row["key"],
+ size=row["size"],
+ platform_url=row["links"]["self"],
+ # extra=dict(),
+ )
+ checksum = row["checksum"]
+ # eg: md5:35ffcab905f8224556dba76648cb7dad
+ if checksum.startswith("md5:"):
+ mf.md5 = checksum[4:]
+ elif checksum.startswith("sha1:"):
+                mf.sha1 = checksum[5:]
+ manifest.append(mf)
+
+ authors = []
+ for author in obj["metadata"]["creators"]:
+ authors.append(author["name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["metadata"]["title"],
+ date=obj["metadata"]["publication_date"],
+ source=obj["links"]["html"],
+ description=obj["metadata"]["description"],
+ license=obj["metadata"]["license"]["id"],
+ version=obj["revision"],
+ # obj['metadata']['version'] is, eg, git version tag
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=obj["revision"]),
+ )
+
+
+class ArchiveOrgHelper(FilesetPlatformHelper):
+
+ FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "archiveorg"
+ self.session = internetarchive.get_session()
+
+ @staticmethod
+ def want_item_file(f: internetarchive.File, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in [
+ "_academictorrents.torrent",
+ "_academictorrents_torrent.txt",
+ ".bib",
+ ]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+ patterns = [
+ "://archive.org/details/",
+ "://archive.org/download/",
+ ]
+ for p in patterns:
+ if p in url:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ base_url_split = request["base_url"].split("/")
+ # print(base_url_split, file=sys.stderr)
+ assert len(base_url_split) in [5, 6]
+ assert base_url_split[0] in ["http:", "https:"]
+ assert base_url_split[2] == "archive.org"
+ assert base_url_split[3] in ["details", "download"]
+ item_name = base_url_split[4]
+ if len(base_url_split) == 6 and base_url_split[5]:
+ raise PlatformScopeError(
+ "got an archive.org file path, not download/details page; individual files not handled yet"
+ )
+
+ # print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ item = self.session.get_item(item_name)
+ item_name = item.identifier
+ item_collection = item.metadata["collection"]
+ if type(item_collection) == list:
+ item_collection = item_collection[0]
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+ manifest = []
+ for f in item_files:
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = FilesetManifestFile(
+ path=f.name,
+ size=int(f.size),
+ sha1=f.sha1,
+ md5=f.md5,
+ mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+ platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+ )
+ manifest.append(mf)
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain="archive.org",
+ platform_id=item_name,
+ archiveorg_item_name=item_name,
+ archiveorg_meta=dict(collection=item_collection),
+ )
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ """
+        Don't use the default strategy picker; archive.org items are always 'existing' items in this case.
+ """
+ assert item.manifest is not None
+ if len(item.manifest) == 1:
+            # NOTE: code flow does not support ArchiveorgFilesetBundled for the
+ # case of, eg, a single zipfile in an archive.org item
+ return IngestStrategy.ArchiveorgFile
+ elif len(item.manifest) >= 1:
+ return IngestStrategy.ArchiveorgFileset
+ else:
+ raise NotImplementedError("empty dataset")
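A rough sketch of how an ingest worker might dispatch across these helpers (resolve_platform is illustrative only, not part of this file):

    HELPERS = [DataverseHelper(), FigshareHelper(), ZenodoHelper(), ArchiveOrgHelper()]

    def resolve_platform(request, resource, html_biblio):
        # try each platform helper in turn; first match wins
        for helper in HELPERS:
            if helper.match_request(request, resource, html_biblio):
                item = helper.process_request(request, resource, html_biblio)
                return helper, item, helper.chose_strategy(item)
        return None, None, None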
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..1d84ce5
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,387 @@
+import os
+import shutil
+import sys
+from typing import Optional
+
+import internetarchive
+import requests
+
+from sandcrawler.fileset_types import (
+ ArchiveStrategyResult,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformScopeError,
+)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
+from sandcrawler.misc import (
+ gen_file_metadata,
+ gen_file_metadata_path,
+ requests_retry_session,
+ sanitize_fs_path,
+)
+
+
+class FilesetIngestStrategy:
+ def __init__(self):
+ # self.ingest_strategy = 'unknown'
+ self.success_status = "success"
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ raise NotImplementedError()
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+
+ # TODO: enable cleanup when confident (eg, safe path parsing)
+ self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+ self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
+ try:
+ os.mkdir(self.working_dir)
+ except FileExistsError:
+ pass
+
+ self.http_session = requests_retry_session()
+ self.ia_session = internetarchive.get_session(
+ config={
+ "s3": {
+ "access": os.environ.get("IA_ACCESS_KEY"),
+ "secret": os.environ.get("IA_SECRET_KEY"),
+ },
+ }
+ )
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ """
+ use API to check for item with all the files in the manifest
+
+ NOTE: this naive comparison is quadratic in number of files, aka O(N^2)
+ """
+ ia_item = self.ia_session.get_item(item.archiveorg_item_name)
+ if not ia_item.exists:
+ return None
+ item_files = ia_item.get_files(on_the_fly=False)
+ assert item.manifest
+ for wanted in item.manifest:
+ found = False
+ for existing in item_files:
+ if existing.name == wanted.path:
+ if (
+ (
+ (existing.sha1 and existing.sha1 == wanted.sha1)
+ or (existing.md5 and existing.md5 == wanted.md5)
+ )
+ and existing.name == wanted.path
+ and existing.size == wanted.size
+ ):
+ found = True
+ wanted.status = "exists"
+ break
+ else:
+ wanted.status = "mismatch-existing"
+ break
+ if not found:
+ print(
+ f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
+ file=sys.stderr,
+ )
+ return None
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status="success-existing",
+ manifest=item.manifest,
+ )
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ May require extra context to pass along to archive.org item creation.
+ """
+ existing = self.check_existing(item)
+ if existing:
+ return existing
+
+ if item.platform_name == "archiveorg":
+ raise PlatformScopeError("shouldn't download archive.org into itself")
+
+ local_dir = self.working_dir + item.archiveorg_item_name
+ assert local_dir.startswith("/")
+ assert local_dir.count("/") > 2
+ try:
+ os.mkdir(local_dir)
+ except FileExistsError:
+ pass
+
+ # 1. download all files locally
+ assert item.manifest
+ for m in item.manifest:
+ if m.path != sanitize_fs_path(m.path):
+ m.status = "unsafe-path"
+ continue
+
+ local_path = local_dir + "/" + m.path
+ assert m.platform_url
+
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ if os.path.exists(local_path):
+ m.status = "exists-local"
+ else:
+ print(f" downloading {m.path}", file=sys.stderr)
+ # create any sub-directories for this path, if necessary
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ try:
+ with self.http_session.get(
+ m.platform_url,
+ stream=True,
+ allow_redirects=True,
+ timeout=2 * 60 * 60,
+ ) as r:
+ r.raise_for_status()
+ with open(local_path + ".partial", "wb") as f:
+ for chunk in r.iter_content(chunk_size=256 * 1024):
+ f.write(chunk)
+ os.rename(local_path + ".partial", local_path)
+ m.status = "downloaded-local"
+ except requests.exceptions.RequestException:
+ m.status = "error-platform-download"
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-platform-download",
+ )
+
+ print(f" verifying {m.path}", file=sys.stderr)
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
+
+ if m.sha1:
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
+ else:
+ m.sha1 = file_meta["sha1hex"]
+
+ if m.sha256:
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
+ else:
+ m.sha256 = file_meta["sha256hex"]
+
+ if m.md5:
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
+ else:
+ m.md5 = file_meta["md5hex"]
+
+ if m.mimetype:
+                # 'magic' isn't good at detecting more specific text formats like text/csv
+ if (
+ file_meta["mimetype"] != m.mimetype
+ and file_meta["mimetype"] != "text/plain"
+ ):
+ # these 'tab-separated-values' from dataverse are just noise, don't log them
+ if m.mimetype != "text/tab-separated-values":
+ print(
+ f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
+ file=sys.stderr,
+ )
+ m.mimetype = file_meta["mimetype"]
+ else:
+ m.mimetype = file_meta["mimetype"]
+ m.status = "verified-local"
+
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
+ # 2. upload all files, with metadata
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
+ item_files = {}
+ for m in item.manifest:
+ local_path = local_dir + "/" + m.path
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
+
+ print(
+ f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
+ file=sys.stderr,
+ )
+ try:
+ internetarchive.upload(
+ item.archiveorg_item_name,
+ files=item_files,
+ metadata=item.archiveorg_item_meta,
+ checksum=True,
+ queue_derive=False,
+ verify=True,
+ )
+ except requests.exceptions.RequestException:
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-archiveorg-upload",
+ )
+
+ for m in item.manifest:
+ m.status = "success"
+
+ # 4. delete local directory
+ if not self.skip_cleanup_local_files:
+ shutil.rmtree(local_dir)
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=self.success_status,
+ manifest=item.manifest,
+ )
+
+ return result
+
+
+class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
+ """
+    ArchiveorgFilesetStrategy currently works fine with individual files; we just
+    need to override the ingest_strategy name.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+ self.success_status = "success-file"
+
+
+class WebFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.WebFileset
+ self.wayback_client = WaybackClient()
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+ self.max_spn_manifest = 20
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
+
+ TODO:
+ - full fetch_resource() method which can do SPN requests
+ """
+
+ assert item.manifest
+ file_file_meta = None
+ file_resource = None
+ for m in item.manifest:
+ fetch_url = m.platform_url
+ if not fetch_url:
+ raise NotImplementedError(
+ "require 'platform_url' for each file when doing Web fetching"
+ )
+
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
+
+ if self.try_spn2 and (
+ resource is None or (resource and resource.status == "no-capture")
+ ):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = "too-much-spn"
+ continue
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(
+ fetch_url, self.wayback_client, force_simple_get=True
+ )
+
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or fetch_url,
+ ),
+ file=sys.stderr,
+ )
+
+ m.terminal_url = resource.terminal_url
+ m.terminal_dt = resource.terminal_dt
+ m.status = resource.status
+ if self.ingest_strategy == "web-file":
+ file_resource = resource
+
+ if resource.status != "success":
+ continue
+ else:
+ assert resource.terminal_status_code == 200
+
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
+
+ if self.ingest_strategy == "web-file":
+ file_file_meta = file_meta
+
+ if (
+ file_meta["size_bytes"] != m.size
+ or (m.md5 and m.md5 != file_meta["md5hex"])
+ or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+ ):
+ m.status = "mismatch"
+ continue
+
+ m.md5 = m.md5 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
+ m.sha256 = m.sha256 or file_meta["sha256hex"]
+ m.mimetype = m.mimetype or file_meta["mimetype"]
+
+ overall_status = self.success_status
+ for m in item.manifest:
+ if m.status != "success":
+ overall_status = m.status or "not-processed"
+ break
+ if not item.manifest:
+ overall_status = "empty-manifest"
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=overall_status,
+ manifest=item.manifest,
+ )
+ if self.ingest_strategy == "web-file":
+ result.file_file_meta = file_file_meta
+ result.file_resource = file_resource
+ return result
+
+
+class WebFileStrategy(WebFilesetStrategy):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.ingest_strategy = IngestStrategy.WebFile
+ self.success_status = "success-file"
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..3398833
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,74 @@
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class IngestStrategy(str, Enum):
+ WebFile = "web-file"
+ WebFileset = "web-fileset"
+ WebFilesetBundled = "web-fileset-bundled"
+ ArchiveorgFile = "archiveorg-file"
+ ArchiveorgFileset = "archiveorg-fileset"
+ ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+
+class FilesetManifestFile(BaseModel):
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+
+class FilesetPlatformItem(BaseModel):
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta: Optional[dict]
+ web_base_url: Optional[str]
+ web_bundle_url: Optional[str]
+
+
+class ArchiveStrategyResult(BaseModel):
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[Dict[str, Any]]
+ file_resource: Optional[Any]
+ bundle_file_meta: Optional[Dict[str, Any]]
+ bundle_resource: Optional[Any]
+ bundle_archiveorg_path: Optional[str]
+
+
+class PlatformScopeError(Exception):
+ """
+    For incidents where a platform helper discovers that the fileset/dataset is
+    out-of-scope after it has already started processing it.
+
+ For example, attempting to ingest:
+
+ - a 'latest version' record, when the platform has version-specific records
+ - a single file within a dataset for a platform which has file-level identifiers
+ """
+
+ pass
+
+
+class PlatformRestrictedError(Exception):
+ """
+ When datasets are not publicly available on a platform (yet)
+ """
+
+ pass
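For example, these models serialize straightforwardly; a small sketch assuming pydantic v1 APIs such as .json():

    mf = FilesetManifestFile(
        path="data/table1.csv",
        size=2048,
        md5="9e107d9d372bb6826bd81d3542a419d6",  # made-up value
        mimetype="text/csv",
    )
    result = ArchiveStrategyResult(
        ingest_strategy=IngestStrategy.WebFileset,
        status="success",
        manifest=[mf],
    )
    print(result.json(exclude_none=True))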
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b010b2c..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,92 +1,301 @@
+import html
+import sys
+import time
+import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
import requests
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
-from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
-from .misc import gen_file_metadata
+from .ia import WaybackClient
+from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
+
+def clean_crossref_unstructured(raw: str) -> str:
+ """
+ Applies Crossref-specific cleanups to an 'unstructured' citation string.
+ """
+
+ # detect repeated strings with double space separating them
+    subs = raw.split("  ")
+ if len(subs) == 2 and subs[0] == subs[1]:
+ raw = subs[0]
+ else:
+ raw = " ".join(subs)
+
+ # remove HTML/XML numeric characters
+ if "&#" in raw or "&amp;" in raw or "&gt;" in raw or "&lt;" in raw:
+ raw = html.unescape(raw)
+
+    raw.replace("\u00a0", " ")
+ raw = raw.strip()
+ return raw
+
+
+def test_clean_ref_str() -> None:
+    # NOTE: this has en-dash and non-breaking space characters in it
+    raw_with_nbsp = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394.  Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ cleaned = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ assert clean_crossref_unstructured(raw_with_nbsp) == cleaned
+
+ # HTML escape characters
+ assert (
+ clean_crossref_unstructured(
+ "J-B Champion, C.Collin, INSEE Premi&#232;re N&#176;1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+ == "J-B Champion, C.Collin, INSEE Première N°1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+
+ # simple doubling
+ assert (
+ clean_crossref_unstructured("https://graph500.org/. https://graph500.org/.")
+ == "https://graph500.org/."
+ )
+ assert (
+ clean_crossref_unstructured(
+ """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg. Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+ == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+
+ # all non-breaking whitespace
+ assert (
+ clean_crossref_unstructured(
+ "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+ )
+ == ""
+ )
-class GrobidClient(object):
- def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+class GrobidClient(object):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
+ self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
+ self.session = requests_retry_session()
- def process_fulltext(self, blob, consolidate_mode=None):
+ def process_fulltext(
+ self, blob: bytes, consolidate_mode: Optional[int] = None
+ ) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
- status (slug)
- error_msg (if status == 'error')
- tei_xml (if status is 200)
-
- TODO: persist connection for performance?
"""
assert blob
- if consolidate_mode == None:
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
+ assert consolidate_mode is not None
try:
- grobid_response = requests.post(
+ grobid_response = self.session.post(
self.host_url + "/api/processFulltextDocument",
files={
- 'input': blob,
- 'consolidateHeader': self.consolidate_mode,
- 'consolidateCitations': 0, # too expensive for now
- 'includeRawCitations': 1,
+ "input": blob,
+ },
+ data={
+ "consolidateHeader": consolidate_mode,
+ "consolidateCitations": 0, # too expensive for now
+ "includeRawCitations": 1,
+ "includeRawAffiliations": 1,
+ "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+ "segmentSentences": 1,
},
timeout=180.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'GROBID request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+            # intentionally re-raising this, so workers crash when GROBID is
+            # unavailable; but add a sleep first to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
- info = dict(
- status_code=grobid_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
- info['status'] = 'success'
- info['tei_xml'] = grobid_response.text
- if len(info['tei_xml']) > 12000000:
+ info["status"] = "success"
+ info["tei_xml"] = grobid_response.text
+ if len(info["tei_xml"]) > 12000000:
# XML is larger than Kafka message size, and much larger than
# an article in general; bail out
- info['status'] = 'error'
- info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
- info.pop('tei_xml')
+ info["status"] = "error"
+ info["error_msg"] = "response XML too large: {} bytes".format(
+ len(info["tei_xml"])
+ )
+ info.pop("tei_xml")
else:
# response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
+ info["status"] = "error"
+ info["error_msg"] = grobid_response.text[:10000]
return info
- def metadata(self, result):
- if result['status'] != 'success':
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
+
+ try:
+ grobid_response = self.session.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
+
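A quick usage sketch for the client so far (assumes a GROBID server reachable at the given URL; paper.pdf is a hypothetical local file):

    client = GrobidClient(host_url="http://localhost:8070")
    with open("paper.pdf", "rb") as f:
        result = client.process_fulltext(f.read())
    if result["status"] == "success":
        tei_xml = result["tei_xml"]
    refs = client.process_citation_list(
        ["Smith J. Some paper title. Journal of Examples 12(3), 2001."]
    )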
+ def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ if result["status"] != "success":
return None
- tei_json = teixml2json(result['tei_xml'], encumbered=False)
+ try:
+ tei_doc = parse_document_xml(result["tei_xml"])
+ except xml.etree.ElementTree.ParseError as pe:
+ result["status"] = "bad-grobid-xml"
+ return dict(error_msg=str(pe)[:1000])
+ tei_doc.remove_encumbered()
+ tei_json = tei_doc.to_legacy_dict()
meta = dict()
biblio = dict()
- for k in ('title', 'authors', 'journal', 'date', 'doi', ):
+ for k in (
+ "title",
+ "authors",
+ "journal",
+ "date",
+ "doi",
+ ):
if tei_json.get(k):
biblio[k] = tei_json[k]
- meta['biblio'] = biblio
- for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+ meta["biblio"] = biblio
+ for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
if tei_json.get(k):
meta[k] = tei_json[k]
return meta
-class GrobidWorker(SandcrawlerFetchWorker):
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+        Helper function to decide whether to run GROBID parsing on a Crossref
+ reference.
- def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
+ For example, if there is already a DOI in the ref metadata, could skip.
+ Or, if there is sufficient structured metadata, or only depending on
+ the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "").strip()) <= 6:
+ return False
+
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
+
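A few illustrative cases for the filter above, using hypothetical Crossref 'reference' entries:

    client = GrobidClient()
    # no DOI and only an unstructured string: worth sending to GROBID
    assert client.should_parse_crossref_ref(
        {"key": "ref1", "unstructured": "Smith J. Some paper title. Journal of Examples, 2001."}
    )
    # already has a DOI: skip
    assert not client.should_parse_crossref_ref(
        {"key": "ref2", "DOI": "10.1234/abc", "unstructured": "Smith J. 2001."}
    )
    # already has enough structured metadata (year, author, journal-title): skip
    assert not client.should_parse_crossref_ref(
        {"key": "ref3", "year": "2001", "author": "Smith",
         "journal-title": "Journal of Examples",
         "unstructured": "Smith J. Journal of Examples, 2001."}
    )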
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+        Given a complete Crossref metadata record, inspects the 'reference' list and
+        runs GROBID citation parsing over any unstructured references found there.
+
+ The returned dict is in the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+ refs = self.process_citation_list(clean_refs)
+
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ refs[i].index = None
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
+
+class GrobidWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ grobid_client: GrobidClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def timeout_response(self, task):
- default_key = task['sha1hex']
+ def timeout_response(self, task: Any) -> Any:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal GROBID worker timeout",
@@ -94,37 +303,74 @@ class GrobidWorker(SandcrawlerFetchWorker):
key=default_key,
)
- def process(self, record, key=None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['source'] = record
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["source"] = record
+ result["key"] = result["file_meta"]["sha1hex"]
return result
+
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ # handle the rare case of bad TEI-XML response
+ # eg: https://github.com/kermitt2/grobid/issues/848
+ try:
+ return self.grobid_client.crossref_refs(record)
+ except xml.etree.ElementTree.ParseError:
+ print(
+ f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ file=sys.stderr,
+ )
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
+ except requests.exceptions.HTTPError:
+ print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+ except requests.exceptions.ReadTimeout:
+ print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, grobid_client, sink=None, **kwargs):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
return result
-
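
A minimal usage sketch of the new citation-parsing helpers in GrobidClient. The
constructor arguments are not shown in this diff, so the host_url keyword, the
example DOI, and the citation strings below are assumptions for illustration,
and a GROBID service must be reachable for the HTTP calls to succeed:

    from sandcrawler.grobid import GrobidClient

    client = GrobidClient(host_url="http://localhost:8070")  # assumed constructor argument

    # parse a small batch of raw citation strings via /api/processCitationList
    refs = client.process_citation_list(
        ["Doe, J. (2019). A paper title. Some Journal 12(3), 1-10."]
    )
    for ref in refs:
        print(ref.to_dict())

    # build a grobid_refs-style row from a (made-up) Crossref work record
    crossref_record = {
        "DOI": "10.1234/example",
        "indexed": {"date-time": "2021-01-01T00:00:00Z"},
        "reference": [
            {"key": "ref0", "unstructured": "Roe, R. (2018). Another paper. Other Journal 4(1)."},
        ],
    }
    row = client.crossref_refs(crossref_record)
    assert row["source"] == "crossref"
    assert row["source_id"] == "10.1234/example"
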
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,18 +1,20 @@
-
+import json
import re
import sys
-import json
import urllib.parse
+from typing import Any, Dict
from bs4 import BeautifulSoup
-RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"')
+RESEARCHSQUARE_REGEX = re.compile(
+ r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"'
+)
IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
-def extract_fulltext_url(html_url, html_body):
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
@@ -20,9 +22,9 @@ def extract_fulltext_url(html_url, html_body):
On error, or if fails to extract a URL, returns an empty dict.
"""
- host_prefix = '/'.join(html_url.split('/')[:3])
+ host_prefix = "/".join(html_url.split("/")[:3])
try:
- soup = BeautifulSoup(html_body, 'html.parser')
+ soup = BeautifulSoup(html_body, "html.parser")
except TypeError as te:
print(f"{te} (url={html_url})", file=sys.stderr)
return dict()
@@ -30,75 +32,75 @@ def extract_fulltext_url(html_url, html_body):
print(f"{ule} (url={html_url})", file=sys.stderr)
return dict()
- ### General Tricks ###
+ # ignoring most type checks on bs4 output in this function (which is partially deprecated)
+ meta: Any
+ url: Any
+ redirect: Any
- # highwire-style meta tag
- meta = soup.find('meta', attrs={"name":"citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
- if not meta:
- # researchgate does this; maybe others also?
- meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
- if not meta:
- meta = soup.find('meta', attrs={"name":"eprints.document_url"})
- # if tag is only partially populated
- if meta and not meta.get('content'):
- meta = None
- # wiley has a weird almost-blank page we don't want to loop on
- if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
- url = meta['content'].strip()
- if '://doi.org/' in url:
- print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
- elif url.startswith('/'):
- if host_prefix+url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
- elif url.startswith('http'):
- if url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=url, technique='citation_pdf_url')
- else:
- print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+ ### General Tricks ###
+ # note: most of these have migrated to the html_biblio code path
- meta = soup.find('meta', attrs={"name":"generator"})
+ meta = soup.find("meta", attrs={"name": "generator"})
meta_generator = None
- if meta and meta.get('content'):
- meta_generator = meta['content'].strip()
+ if meta and meta.get("content"):
+ meta_generator = meta["content"].strip()
### Publisher/Platform Specific ###
# research square (researchsquare.com)
- if 'researchsquare.com/article/' in html_url:
+ if "researchsquare.com/article/" in html_url:
# JSON in body with a field like:
# "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
- m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8'))
+ m = RESEARCHSQUARE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="manuscript", pdf_url=url, technique='publisher')
+ return dict(release_stage="manuscript", pdf_url=url, technique="publisher")
# elsevier linking hub
# https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
- if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
+ if "://linkinghub.elsevier.com/retrieve/pii/" in html_url:
# <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/>
redirect = soup.find("input", attrs={"name": "redirectURL"})
if redirect:
- url = redirect['value'].strip()
- if 'http' in url:
+ url = redirect["value"].strip()
+ if "http" in url:
url = urllib.parse.unquote(url)
# drop any '?via' query parameter
- url = url.split('?via')[0]
+ url = url.split("?via")[0]
return dict(next_url=url, technique="elsevier-linkinghub")
+ # sciencedirect PDF URL extract
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+ if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
+ json_tag: Any = soup.find(
+ "script", attrs={"type": "application/json", "data-iso-key": "_0"}
+ )
+ url = None
+ if json_tag:
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"]
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = (
+ html_url
+ + pdf_meta["pdfExtension"]
+ + "?md5="
+ + pdf_meta["queryParams"]["md5"]
+ + "&pid="
+ + pdf_meta["queryParams"]["pid"]
+ )
+ except (KeyError, TypeError, json.JSONDecodeError):
+ pass
+ if url:
+ return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
# sciencedirect PDF bounce page
# https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
- if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
+ if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"):
# window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=[...]&hash=[...]&host=[...]&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=[...]&type=client';
- m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode('utf-8'))
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4000
@@ -106,85 +108,108 @@ def extract_fulltext_url(html_url, html_body):
# ieeexplore.ieee.org
# https://ieeexplore.ieee.org/document/8730316
- if '://ieeexplore.ieee.org/document/' in html_url:
+ if "://ieeexplore.ieee.org/document/" in html_url:
# JSON in body with a field like:
# "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
- m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8'))
+ m = IEEEXPLORE_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(release_stage="published", pdf_url=host_prefix+url, technique="ieeexplore")
+ return dict(
+ release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore"
+ )
# https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
- if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
+ if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url:
# HTML iframe like:
# <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
- iframe = soup.find("iframe")
- if iframe and '.pdf' in iframe['src']:
- return dict(pdf_url=iframe['src'], technique="iframe")
+ iframe: Any = soup.find("iframe")
+ if iframe and ".pdf" in iframe["src"]:
+ return dict(pdf_url=iframe["src"], technique="iframe")
# https://insights.ovid.com/crossref?an=00042307-202001000-00013
# Ovid is some kind of landing page bounce portal tracking run-around.
# Can extract actual journal URL from javascript blob in the HTML
- if '://insights.ovid.com/crossref' in html_url:
+ if "://insights.ovid.com/crossref" in html_url:
# var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
- m = OVID_JOURNAL_URL_REGEX.search(html_body.decode('utf-8'))
+ m = OVID_JOURNAL_URL_REGEX.search(html_body.decode("utf-8"))
if m:
url = m.group(1)
assert len(url) < 4096
- return dict(next_url=url, technique='ovid')
+ return dict(next_url=url, technique="ovid")
# osf.io
# https://osf.io/8phvx/
# https://osf.io/preprints/socarxiv/8phvx/
# wow, they ship total javascript crud! going to just guess download URL
# based on URL for now. Maybe content type header would help?
- if '://osf.io/' in html_url and not '/download' in html_url:
- if not html_url.endswith("/"):
- next_url = html_url+"/download"
- else:
- next_url = html_url+"download"
- return dict(next_url=next_url, technique='osf-by-url')
+ OSF_DOMAINS = [
+ "://osf.io/",
+ "://biohackrxiv.org/",
+ "://psyarxiv.com/",
+ "://arabixiv.org/",
+ "://engrxiv.org/",
+ "://edarxiv.org//",
+ "://ecsarxiv.org/",
+ "://ecoevorxiv.org/",
+ "://frenxiv.org/",
+ "://indiarxiv.org/",
+ "://mindrxiv.org/",
+ "://mediarxiv.org/",
+ "://paleorxiv.org/",
+ "://thesiscommons.org/",
+ ]
+ for domain in OSF_DOMAINS:
+ if (
+ domain in html_url
+ and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url)
+ and "/download" not in html_url
+ ):
+ if not html_url.endswith("/"):
+ next_url = html_url + "/download"
+ else:
+ next_url = html_url + "download"
+ return dict(next_url=next_url, technique="osf-by-url")
# wiley
# https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
if b"/doi/pdfdirect/" in html_body:
- next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
- return dict(next_url=next_url, technique='wiley-pdfdirect')
+ next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/")
+ return dict(next_url=next_url, technique="wiley-pdfdirect")
# arxiv abstract pages
if "://arxiv.org/abs/" in html_url:
url = html_url.replace("/abs/", "/pdf/")
- return dict(pdf_url=url, technique='arxiv-url')
+ return dict(pdf_url=url, technique="arxiv-url")
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
- if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+ if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
# use a more aggressive direct guess to avoid rate-limiting...
if "/doi/10." in html_url:
url = html_url.replace("/doi/10.", "/doi/pdf/10.")
- return dict(pdf_url=url, technique='archivist-url')
+ return dict(pdf_url=url, technique="archivist-url")
# <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
- hrefs = soup.find_all('a', attrs={"target":"_blank"})
+ hrefs = soup.find_all("a", attrs={"target": "_blank"})
for href in hrefs:
- url = href['href'].strip()
+ url = href["href"].strip()
if "/doi/pdf/" in url:
- if url.startswith('http'):
- return dict(pdf_url=url, technique='publisher-href')
- elif url.startswith('/'):
- return dict(pdf_url=host_prefix+url, technique='publisher-href')
+ if url.startswith("http"):
+ return dict(pdf_url=url, technique="publisher-href")
+ elif url.startswith("/"):
+ return dict(pdf_url=host_prefix + url, technique="publisher-href")
# protocols.io
# https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='protocolsio-url')
+ return dict(pdf_url=url, technique="protocolsio-url")
# degruyter.com
# https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
- url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf')
- return dict(pdf_url=url, technique='degruyter-url')
+ url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf")
+ return dict(pdf_url=url, technique="degruyter-url")
# journals.lww.com (Wolters Kluwer)
# https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
@@ -192,116 +217,141 @@ def extract_fulltext_url(html_url, html_body):
# we never get the content.
if "://journals.lww.com/" in html_url and False:
# data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
- for line in html_body.split(b'\n'):
+ for line in html_body.split(b"\n"):
if b"data-pdf-url=" in line:
- line = line.decode('utf-8')
- url = line.strip().replace('data-pdf-url=', '').replace('"', '')
- if url.startswith('http') and 'pdfs.journals.lww.com' in url:
- return dict(pdf_url=url, technique='journals.lww.com-jsvar')
+ line = line.decode("utf-8")
+ url = line.strip().replace("data-pdf-url=", "").replace('"', "")
+ if url.startswith("http") and "pdfs.journals.lww.com" in url:
+ return dict(pdf_url=url, technique="journals.lww.com-jsvar")
# www.ahajournals.org
# https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
- if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+ if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url:
# <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/10.', '/doi/pdf/10.')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
url = url + "?download=true"
- return dict(pdf_url=url, technique='ahajournals-url')
+ return dict(pdf_url=url, technique="ahajournals-url")
# ehp.niehs.nih.gov
# https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
# https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
if "://ehp.niehs.nih.gov/doi/" in html_url:
# <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
- if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
- return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace(
+ "/doi/10.", "/doi/pdf/10."
+ )
+ return dict(pdf_url=url, technique="ehp.niehs.nigh.gov-url")
# cogentoa.com
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
- if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+ if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
# blech, it's a SPA! All JS
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
url = html_url + ".pdf"
- return dict(pdf_url=url, technique='cogentoa-url')
+ return dict(pdf_url=url, technique="cogentoa-url")
# chemrxiv.org (likely to be other figshare domains also)
# https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
- if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url:
+ if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url:
# <script id="app-data" type="text/json"> [...] </script>
- json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"})
+ json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"})
if json_tag and json_tag.string:
app_data = json.loads(json_tag.string)
# "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
- url = app_data.get('article', {}).get('exportPdfDownloadUrl')
- if url and url.startswith('http'):
- return dict(pdf_url=url, technique='figshare-json')
+ url = app_data.get("article", {}).get("exportPdfDownloadUrl")
+ if url and url.startswith("http"):
+ return dict(pdf_url=url, technique="figshare-json")
# CNKI COVID-19 landing pages
# http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
- if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
+ if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url:
# <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
- href = soup.find('a', attrs={"id":"pdfDown"})
+ href = soup.find("a", attrs={"id": "pdfDown"})
if href:
- url = href['href'].strip().replace('&#xA;', '')
- if not url.startswith('http'):
+ url = href["href"].strip().replace("&#xA;", "")
+ if not url.startswith("http"):
url = host_prefix + url
- return dict(pdf_url=url, technique='cnki-href')
+ return dict(pdf_url=url, technique="cnki-href")
# RWTH AACHEN repository
- if '://publications.rwth-aachen.de/record/' in html_url:
- record_id = html_url.split('/')[-1]
+ if "://publications.rwth-aachen.de/record/" in html_url:
+ record_id = html_url.split("/")[-1]
url = f"{html_url}/files/{record_id}.pdf"
- if record_id.isdigit() and url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='rwth-aachen-url')
+ if record_id.isdigit() and url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="rwth-aachen-url")
# physchemaspects.ru
- if '://physchemaspects.ru/' in html_url and soup:
- for href in soup.find_all('a'):
+ if "://physchemaspects.ru/" in html_url and soup:
+ for href in soup.find_all("a"):
if href.text == "download PDF file":
- url = href['href']
- if url.startswith('/'):
+ url = href["href"]
+ if url.startswith("/"):
url = host_prefix + url
- return dict(pdf_url=url, technique='physchemaspects-href')
+ return dict(pdf_url=url, technique="physchemaspects-href")
# OJS 3 (some)
if meta_generator and meta_generator.startswith("Open Journal Systems"):
- href = soup.find('a', attrs={"class":"obj_galley_link file"})
+ href = soup.find("a", attrs={"class": "obj_galley_link file"})
if href and href.text and "pdf" in href.text.lower():
- url = href['href'].strip()
- if url.startswith('/'):
+ url = href["href"].strip()
+ if url.startswith("/"):
url = host_prefix + url
- return dict(pdf_url=url, technique='ojs-galley-href')
+ return dict(pdf_url=url, technique="ojs-galley-href")
# ETH zurich e-periodica
- if '://www.e-periodica.ch/digbib/view' in html_url:
- url = html_url.replace('digbib/view', 'cntmng').split('#')[0]
- if url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='href-eperiodica')
+ if "://www.e-periodica.ch/digbib/view" in html_url:
+ url = html_url.replace("digbib/view", "cntmng").split("#")[0]
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="href-eperiodica")
# JMIR
# https://mhealth.jmir.org/2020/7/e17891/
- if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"):
+ if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"):
url = html_url + "pdf"
- return dict(pdf_url=url, technique='jmir-url')
+ return dict(pdf_url=url, technique="jmir-url")
+
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body
- if not '.pdf' in html_url:
+ if ".pdf" not in html_url:
url = html_url + ".pdf"
- if url.encode('utf-8') in html_body:
- return dict(pdf_url=url, technique='guess-url-plus-pdf')
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="guess-url-plus-pdf")
return dict()
-def test_regex():
+
+def test_regex() -> None:
lines = """
blah
var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
asdf"""
m = OVID_JOURNAL_URL_REGEX.search(lines)
- assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ assert m
+ assert (
+ m.group(1)
+ == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ )
lines = """
window.onload = function () {
@@ -311,4 +361,5 @@ def test_regex():
"""
url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+ assert m
assert m.group(1) == url
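
A short, self-contained sketch of how extract_fulltext_url is called; the
journal domain, paths, and HTML snippet are invented, and the expected result
follows from the OJS galley rule above:

    from sandcrawler.html import extract_fulltext_url

    html_body = b"""
    <html><head>
      <meta name="generator" content="Open Journal Systems 3.2.1.1">
    </head><body>
      <a class="obj_galley_link file" href="/article/download/10/11">PDF</a>
    </body></html>
    """

    result = extract_fulltext_url("https://journal.example.org/article/view/10", html_body)
    # under the OJS galley rule this should yield:
    # {'pdf_url': 'https://journal.example.org/article/download/10/11',
    #  'technique': 'ojs-galley-href'}
    print(result)
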
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index f9f48a6..1e2d197 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,15 @@
-
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
@@ -22,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
# order of these are mostly by preference/quality (best option first), though
# also/sometimes re-ordered for lookup efficiency (lookup stops after first
# match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
"title": [
"meta[name='citation_title']",
"meta[name='eprints.title']",
@@ -159,7 +157,7 @@ HEAD_META_PATTERNS: Any = {
],
}
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
"contrib_names": [
"meta[name='citation_author']",
"meta[name='bepress_citation_author']",
@@ -180,7 +178,7 @@ HEAD_META_LIST_PATTERNS: Any = {
],
}
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_xml_url']",
"attr": "content",
@@ -209,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "SciElo XML link",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"in_fulltext_url": "viewXML",
"selector": "a[class='obj_galley_link']",
"attr": "href",
@@ -222,9 +220,17 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "ARPHA XML link",
"example_page": "https://zookeys.pensoft.net/article/26391",
},
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "xml",
+ "selector": "a.download-files-nlm",
+ "attr": "href",
+ "technique": "XML (NLM) download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
]
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_fulltext_html_url']",
"attr": "content",
@@ -249,11 +255,36 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
"attr": "href",
"technique": "dovepress fulltext link",
},
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+]
+
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "in_doc_url": "pensoft.net/article/", # also /element/
+ "in_fulltext_url": "/download/fig/",
+ "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+ "attr": "href",
+ "technique": "Active figure download link (zookeys)",
+ "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+ },
+ {
+ "in_doc_url": "/file.xhtml?persistentId",
+ "in_fulltext_url": "/access/datafile/",
+ "selector": "div.form-group code",
+ "use_body": "true",
+ "technique": "Dataverse 'download URL'",
+ "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+ },
]
# This is a database of matching patterns. Most of these were discovered by
# hand, looking at OA journal content that failed to crawl/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "head meta[name='citation_pdf_url']",
"attr": "content",
@@ -272,7 +303,7 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
},
{
- "selector": "head meta[propery='citation_pdf_url']",
+ "selector": "head meta[property='citation_pdf_url']",
"attr": "content",
"technique": "citation_pdf_url",
# eg, researchgate
@@ -300,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
},
{
- "in_doc_url": "/article/view/",
+ "in_doc_url": "/view/",
"selector": "a#pdfDownloadLink",
"attr": "href",
- "technique": "pdfDownloadLink link",
+ "technique": "OJS pdfDownloadLink link",
"example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
},
{
@@ -375,16 +406,371 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF URL link",
"example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
},
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
+ {
+ "in_doc_url": "library.wur.nl/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.wl_full_text_restricted",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+ },
+ {
+ "in_doc_url": "/dlibra/",
+ "in_fulltext_url": "pdf",
+ "selector": "iframe#js-main-frame",
+ "attr": "src",
+ "technique": "PDF iframe (dlibra)",
+ "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+ },
+ {
+ "in_doc_url": "/handle/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.misc table.inner tr.b a",
+ "attr": "href",
+ "technique": "PDF URL link (DSpace, first file)",
+ "example_page": "https://orbi.uliege.be/handle/2268/174200",
+ },
+ {
+ "in_doc_url": "/publications/",
+ "in_fulltext_url": "pdf",
+ "selector": ".publication-sidebar li.open-access a.document-link",
+ "attr": "href",
+ "technique": "PDF URL link (Pure repo, OA link)",
+ "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+ },
+ {
+ "in_doc_url": "//hal",
+ "selector": ".widget-openaccess .widget-content a",
+ "attr": "href",
+ "technique": "Fulltext OA URL (HAL)",
+ "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+ },
+ {
+ "in_doc_url": "/record/",
+ "in_fulltext_url": "pdf",
+ "selector": "#detailedrecordminipanelfile a",
+ "attr": "href",
+ "technique": "PDF URL link (Invenio)",
+ "example_page": "https://bib-pubdb1.desy.de/record/416556",
+ },
+ {
+ "in_doc_url": "/available/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.file-table a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+ },
+ {
+ "in_doc_url": "/islandora/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.islandora-pdf-link",
+ "attr": "href",
+ "technique": "PDF URL link (Islandora)",
+ "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+ },
+ {
+ "in_doc_url": "/receive/",
+ "in_fulltext_url": "pdf",
+ "selector": ".mir-preview noscript a",
+ "attr": "href",
+ "technique": "PDF iframe via noscript (MyCoRe)",
+ "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+ },
+ {
+ "in_doc_url": "/registro.do",
+ "in_fulltext_url": "imagenes",
+ "selector": ".resumen_bib a[data-analytics=media]",
+ "attr": "href",
+ "technique": "Media link (DIGIBIS)",
+ "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+ },
+ {
+ "in_doc_url": "/view",
+ "in_fulltext_url": "/at_download/",
+ "selector": ".documentContent #content a",
+ "attr": "href",
+ "technique": "Media link (Plone)",
+ "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+ },
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
+ {
+ "in_doc_url": "journals.uchicago.edu",
+ "in_fulltext_url": "pdf",
+ "selector": "nav.article__navbar a.ctrl--pdf",
+ "attr": "href",
+ "technique": "PDF Download link (journals.uchicago.edu)",
+ "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+ },
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
+ {
+ "in_doc_url": "cdnsciencepub.com",
+ "in_fulltext_url": "pdf",
+ "selector": "article .info-panel a.btn--pdf",
+ "attr": "href",
+ "technique": "cdnsciencepub.com PDF link",
+ "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+ },
+ {
+ "in_doc_url": "grrjournal.com",
+ "in_fulltext_url": "pdf",
+ "selector": ".ereaders-main-section a[download]",
+ "attr": "href",
+ "technique": "grrjournal.com PDF link",
+ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "pdf",
+ "selector": "#articleFullText a.remote_pdf",
+ "attr": "href",
+ "technique": "OJS remote_pdf link",
+ "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/view/",
+ "selector": "body .entry_details a.pdf",
+ "attr": "href",
+ "technique": "generic OJS/preprints",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body header a.download",
+ "attr": "href",
+ "technique": "generic OJS/preprints PDF Embed",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+ },
]
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
# wiley has a weird almost-blank page we don't want to loop on
- "://onlinelibrary.wiley.com/doi/pdf/"
- "://doi.org/"
- "://dx.doi.org/"
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "{'embed': '",
]
-RELEASE_TYPE_MAP = {
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
+]
+
+RELEASE_TYPE_MAP: Dict[str, str] = {
"research article": "article-journal",
"text.serial.journal": "article-journal",
}
@@ -426,14 +812,15 @@ class BiblioMetadata(pydantic.BaseModel):
pdf_fulltext_url: Optional[str]
html_fulltext_url: Optional[str]
xml_fulltext_url: Optional[str]
+ component_url: Optional[str]
class Config:
- json_encoders = {
- datetime.date: lambda dt: dt.isoformat()
- }
+ json_encoders = {datetime.date: lambda dt: dt.isoformat()}
-def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
+def html_extract_fulltext_url(
+ doc_url: str, doc: HTMLParser, patterns: List[dict]
+) -> Optional[Tuple[str, str]]:
"""
Tries to quickly extract fulltext URLs using a set of patterns. This
function is intended to be generic across various extraction techniques.
@@ -442,49 +829,74 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
"""
self_doc_url: Optional[Tuple[str, str]] = None
for pattern in patterns:
- if not 'selector' in pattern:
+ if "selector" not in pattern:
continue
- if 'in_doc_url' in pattern:
- if not pattern['in_doc_url'] in doc_url:
+ if "in_doc_url" in pattern:
+ if pattern["in_doc_url"] not in doc_url:
continue
- elem = doc.css_first(pattern['selector'])
+ elem = doc.css_first(pattern["selector"])
if not elem:
continue
- if 'attr' in pattern:
- val = elem.attrs.get(pattern['attr'])
- if not val:
+ val = None
+ if "attr" in pattern:
+ val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
+ elif pattern.get("use_body"):
+ val = elem.text()
+ if "://" not in val:
continue
- val = urllib.parse.urljoin(doc_url, val)
- assert val
- if 'in_fulltext_url' in pattern:
- if not pattern['in_fulltext_url'] in val:
- continue
- for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
- if skip_pattern in val.lower():
- continue
- if url_fuzzy_equal(doc_url, val):
- # don't link to self, unless no other options
- self_doc_url = (val, pattern.get('technique', 'unknown'))
+ if not val:
+ continue
+ val = urllib.parse.urljoin(doc_url, val)
+ assert val
+ if "in_fulltext_url" in pattern:
+ if pattern["in_fulltext_url"] not in val:
continue
- return (val, pattern.get('technique', 'unknown'))
+ skip_matched = False
+ for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+ if skip_pattern in val.lower():
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ if url_fuzzy_equal(doc_url, val):
+ # don't link to self, unless no other options
+ self_doc_url = (val, pattern.get("technique", "unknown"))
+ continue
+
+ # quirks modes / hacks
+ if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+ val = val[:-1]
+
+ return (val, pattern.get("technique", "unknown"))
if self_doc_url:
- print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
return self_doc_url
return None
+
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
meta: Any = dict()
head = doc.css_first("head")
if not head:
+ print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
return None
for field, patterns in HEAD_META_PATTERNS.items():
for pattern in patterns:
val = head.css_first(pattern)
- #print((field, pattern, val))
- if val and 'content' in val.attrs and val.attrs['content']:
- meta[field] = val.attrs['content']
+ # print((field, pattern, val))
+ if val and "content" in val.attrs and val.attrs["content"]:
+ meta[field] = val.attrs["content"]
break
for field, patterns in HEAD_META_LIST_PATTERNS.items():
@@ -492,53 +904,57 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
val_list = head.css(pattern)
if val_list:
for val in val_list:
- if 'content' in val.attrs and val.attrs['content']:
- if not field in meta:
+ if "content" in val.attrs and val.attrs["content"]:
+ if field not in meta:
meta[field] = []
- meta[field].append(val.attrs['content'])
+ meta[field].append(val.attrs["content"])
break
# (some) fulltext extractions
pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
if pdf_fulltext_url:
- meta['pdf_fulltext_url'] = pdf_fulltext_url[0]
+ meta["pdf_fulltext_url"] = pdf_fulltext_url[0]
xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
if xml_fulltext_url:
- meta['xml_fulltext_url'] = xml_fulltext_url[0]
+ meta["xml_fulltext_url"] = xml_fulltext_url[0]
html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
if html_fulltext_url:
- meta['html_fulltext_url'] = html_fulltext_url[0]
+ meta["html_fulltext_url"] = html_fulltext_url[0]
+ component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+ if component_url:
+ meta["component_url"] = component_url[0]
# TODO: replace with clean_doi() et al
- if meta.get('doi') and meta.get('doi').startswith('doi:'):
- meta['doi'] = meta['doi'][4:]
+ if meta.get("doi") and meta.get("doi").startswith("doi:"):
+ meta["doi"] = meta["doi"][4:]
- raw_identifiers = meta.pop('raw_identifiers', [])
+ raw_identifiers = meta.pop("raw_identifiers", [])
for ident in raw_identifiers:
- if ident.startswith('doi:10.'):
- if not 'doi' in meta:
- meta['doi'] = ident.replace('doi:', '')
- elif ident.startswith('10.') and '/' in ident:
- if not 'doi' in meta:
- meta['doi'] = ident
- elif ident.startswith('isbn:'):
- if not 'isbn' in meta:
- meta['isbn'] = ident.replace('isbn:', '')
-
- raw_date = meta.pop('raw_date', None)
+ if ident.startswith("doi:10."):
+ if "doi" not in meta:
+ meta["doi"] = ident.replace("doi:", "")
+ elif ident.startswith("10.") and "/" in ident:
+ if "doi" not in meta:
+ meta["doi"] = ident
+ elif ident.startswith("isbn:"):
+ if "isbn" not in meta:
+ meta["isbn"] = ident.replace("isbn:", "")
+
+ raw_date = meta.pop("raw_date", None)
if raw_date:
parsed = dateparser.parse(raw_date)
if parsed:
- meta['release_date'] = parsed.date()
+ meta["release_date"] = parsed.date()
- raw_release_type = meta.pop('raw_release_type', None)
+ raw_release_type = meta.pop("raw_release_type", None)
if raw_release_type:
release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
if release_type:
- meta['release_type'] = release_type
+ meta["release_type"] = release_type
return BiblioMetadata(**meta)
+
def load_adblock_rules() -> braveblock.Adblocker:
"""
TODO: consider blocking very generic assets:
@@ -561,46 +977,67 @@ def load_adblock_rules() -> braveblock.Adblocker:
"||pbs.twimg.com^",
"||badge.dimensions.ai^",
"||recaptcha.net^",
-
+ "||tag.imagino.com^",
+ "||consent.cookiebot.com^",
+ "||recaptcha.net^",
# not sure about these CC badges (usually via a redirect)
- #"||licensebuttons.net^",
- #"||i.creativecommons.org^",
-
+ # "||licensebuttons.net^",
+ # "||i.creativecommons.org^",
# Should we skip jquery, or other generic javascript CDNs?
- #"||code.jquery.com^",
- #"||ajax.googleapis.com^",
- #"||cdnjs.cloudflare.com^",
-
+ # "||code.jquery.com^",
+ # "||ajax.googleapis.com^",
+ # "||cdnjs.cloudflare.com^",
# badges, "share" buttons, tracking, etc
"apis.google.com/js/plusone",
"www.google.com/recaptcha/",
"js/_getUACode.js"
-
# PLOS images
"/resource/img/icon.*.16.png^",
+ # CAIRN broken tracking tag
+ "cairn-int.info//about.php?cairn_guest=",
],
)
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(
+ doc: HTMLParser, selector: str, attrs: List[str], type_name: str
+) -> List[Dict[str, str]]:
resources = []
for node in doc.css(selector):
for attr in attrs:
- if not attr in node.attrs:
+ if attr not in node.attrs:
continue
url = node.attrs.get(attr)
+ # special-case a couple meta URI prefixes which don't match with adblock rules
+ skip = False
+ for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
+ if url and url.startswith(prefix):
+ skip = True
+ break
+ if url and "/" not in url and "." not in url and " " in url:
+ # eg: "Ce fichier n'existe pas"
+ skip = True
+ if skip:
+ continue
+ if url and url.startswith("https://https://"):
+ url = url[8:]
+ elif url and url.startswith("http://http://"):
+ url = url[7:]
if url:
- resources.append(dict(url=url, type=type_name))
+ # print(url, file=sys.stderr)
+ resources.append(dict(url=url.strip(), type=type_name))
return resources
-def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list:
+def html_extract_resources(
+ doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker
+) -> List[Dict[str, str]]:
"""
This function tries to find all the important resources in a page. The
presumption is that the HTML document is article fulltext, and we want the
- list of all resoures (by URL) necessary to replay the page.
+ list of all resources (by URL) necessary to replay the page.
The returned resource URLs each have a type (script, img, css, etc), and
should be fully-qualified URLs (not relative).
@@ -624,13 +1061,17 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Ad
# ensure URLs are absolute
for r in resources:
- r['url'] = urllib.parse.urljoin(doc_url, r['url'])
+ r["url"] = urllib.parse.urljoin(doc_url, r["url"])
# filter using adblocker
- resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False]
+ resources = [
+ r
+ for r in resources
+ if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"])
+ is False
+ ]
# remove duplicates
resources = [dict(t) for t in {tuple(d.items()) for d in resources}]
return resources
-
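
A small sketch of how the html_metadata helpers fit together; the domain,
metadata values, and HTML snippet are invented, and which fields end up
populated on BiblioMetadata depends on the pattern tables above:

    from selectolax.parser import HTMLParser

    from sandcrawler.html_metadata import (
        html_extract_biblio,
        html_extract_resources,
        load_adblock_rules,
    )

    doc_url = "https://journal.example.org/article/view/42"
    doc = HTMLParser("""
    <html><head>
      <meta name="citation_title" content="An Example Article">
      <meta name="citation_author" content="Jane Doe">
      <meta name="citation_pdf_url" content="/article/download/42/43">
    </head><body>
      <img src="/static/figure1.png">
    </body></html>
    """)

    biblio = html_extract_biblio(doc_url, doc)
    assert biblio is not None
    print(biblio.pdf_fulltext_url)  # https://journal.example.org/article/download/42/43

    adblock = load_adblock_rules()
    for resource in html_extract_resources(doc_url, doc, adblock):
        # each resource is a dict with an absolute 'url' and a 'type' string
        print(resource)
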
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1,31 +1,31 @@
-
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from http.client import IncompleteRead
+from typing import Any, Dict, List, Optional, Tuple, Union
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
@@ -34,62 +34,78 @@ class SandcrawlerBackoffError(Exception):
be passed up through any timeout/retry code and become an actual long pause
or crash.
"""
+
pass
-ResourceResult = namedtuple("ResourceResult", [
- "start_url",
- "hit",
- "status",
- "terminal_url",
- "terminal_dt",
- "terminal_status_code",
- "body",
- "cdx",
- "revisit_cdx",
-])
-
-WarcResource = namedtuple("WarcResource", [
- "status_code",
- "location",
- "body",
- "revisit_cdx",
-])
-
-CdxRow = namedtuple('CdxRow', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
- 'warc_csize',
- 'warc_offset',
- 'warc_path',
-])
-
-CdxPartial = namedtuple('CdxPartial', [
- 'surt',
- 'datetime',
- 'url',
- 'mimetype',
- 'status_code',
- 'sha1b32',
- 'sha1hex',
-])
-
-def cdx_partial_from_row(full):
+
+ResourceResult = namedtuple(
+ "ResourceResult",
+ [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+ "revisit_cdx",
+ ],
+)
+
+WarcResource = namedtuple(
+ "WarcResource",
+ [
+ "status_code",
+ "location",
+ "body",
+ "revisit_cdx",
+ ],
+)
+
+CdxRow = namedtuple(
+ "CdxRow",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ "warc_csize",
+ "warc_offset",
+ "warc_path",
+ ],
+)
+
+CdxPartial = namedtuple(
+ "CdxPartial",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ ],
+)
+
+
+def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
return CdxPartial(
- surt=full.surt,
- datetime=full.datetime,
- url=full.url,
- mimetype=full.mimetype,
- status_code=full.status_code,
- sha1b32=full.sha1b32,
- sha1hex=full.sha1hex,
+ surt=row.surt,
+ datetime=row.datetime,
+ url=row.url,
+ mimetype=row.mimetype,
+ status_code=row.status_code,
+ sha1b32=row.sha1b32,
+ sha1hex=row.sha1hex,
)
-def cdx_to_dict(cdx):
+
+def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
d = {
"surt": cdx.surt,
"datetime": cdx.datetime,
@@ -99,67 +115,82 @@ def cdx_to_dict(cdx):
"sha1b32": cdx.sha1b32,
"sha1hex": cdx.sha1hex,
}
- if type(cdx) == CdxRow and '/' in cdx.warc_path:
- d['warc_csize'] = cdx.warc_csize
- d['warc_offset'] = cdx.warc_offset
- d['warc_path'] = cdx.warc_path
+ if type(cdx) == CdxRow and "/" in cdx.warc_path:
+ d["warc_csize"] = cdx.warc_csize
+ d["warc_offset"] = cdx.warc_offset
+ d["warc_path"] = cdx.warc_path
return d
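
[Editor's note] A minimal sketch of these CDX helpers, assuming the namedtuple fields shown above; every value below is invented for illustration:

    from sandcrawler.ia import CdxRow, cdx_partial_from_row, cdx_to_dict

    row = CdxRow(
        surt="org,example)/paper.pdf",
        datetime="20200202123456",
        url="https://example.org/paper.pdf",
        mimetype="application/pdf",
        status_code=200,
        sha1b32="ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",
        sha1hex="0123456789abcdef0123456789abcdef01234567",
        warc_csize=12345,
        warc_offset=678910,
        warc_path="EXAMPLE-CRAWL-2020/EXAMPLE-20200202-00001.warc.gz",
    )
    partial = cdx_partial_from_row(row)  # drops the warc_* fields
    d = cdx_to_dict(row)                 # keeps warc_* keys, since this is a full CdxRow with "/" in warc_path
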
-def fuzzy_match_url(left, right):
+
+def fuzzy_match_url(left: str, right: str) -> bool:
"""
Matches URLs agnostic of http/https (and maybe other normalizations in the
future)
"""
if left == right:
return True
- if '://' in left and '://' in right:
- left = '://'.join(left.split('://')[1:])
- right = '://'.join(right.split('://')[1:])
+ if "://" in left and "://" in right:
+ left = "://".join(left.split("://")[1:])
+ right = "://".join(right.split("://")[1:])
if left == right:
return True
if left == right + "/" or right == left + "/":
return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
return False
-def test_fuzzy_match_url():
- assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+
+def test_fuzzy_match_url() -> None:
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
+
class CdxApiError(Exception):
pass
-class CdxApiClient:
- def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
+class CdxApiClient:
+ def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- cdx_auth_token = kwargs.get('cdx_auth_token',
- os.environ.get('CDX_AUTH_TOKEN'))
+ cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN"))
if not cdx_auth_token:
- raise Exception("CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)")
- self.http_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
- 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
- })
+ raise Exception(
+ "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)"
+ )
+ self.http_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient",
+ "Cookie": "cdx_auth_token={}".format(cdx_auth_token),
+ }
+ )
- def _query_api(self, params):
+ def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
"""
Hits CDX API with a query, parses result into a list of CdxRow
"""
resp = self.http_session.get(self.host_url, params=params)
if resp.status_code != 200:
raise CdxApiError(resp.text)
- #print(resp.url, file=sys.stderr)
+ # print(resp.url, file=sys.stderr)
if not resp.text:
return None
rj = resp.json()
@@ -180,8 +211,17 @@ class CdxApiClient:
else:
status_code = int(raw[4])
- # CDX rows with no WARC records?
- if raw[8] == '-' or raw[9] == '-' or raw[10] == '-':
+ # remove CDX rows with no WARC records (?)
+ if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
+ continue
+
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
continue
row = CdxRow(
@@ -200,23 +240,31 @@ class CdxApiClient:
rows.append(row)
return rows
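
[Editor's note] For reference, a sketch of the raw row shape that `_query_api()` consumes, inferred from the indexes used above (raw[4] status, raw[5] digest, raw[8..10] size/offset/filename); the exact meaning of the unused middle columns is an assumption, and all values are invented:

    # hypothetical raw CDX API row (JSON array) as indexed by the parser above
    raw = [
        "org,example)/paper.pdf",        # raw[0]: SURT key
        "20200202123456",                # raw[1]: capture datetime
        "https://example.org/paper.pdf", # raw[2]: original URL
        "application/pdf",               # raw[3]: mimetype
        "200",                           # raw[4]: status code ("-" handled above)
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",  # raw[5]: SHA1 base32 digest
        "-", "-",                        # raw[6..7]: unused here (assumption: redirect/robot flags)
        "12345",                         # raw[8]: compressed record size -> warc_csize
        "678910",                        # raw[9]: WARC offset -> warc_offset
        "EXAMPLE-CRAWL-2020/EXAMPLE-20200202-00001.warc.gz",  # raw[10] -> warc_path
    ]
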
- def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None):
+ def fetch(
+ self,
+ url: str,
+ datetime: str,
+ filter_status_code: Optional[int] = None,
+ retry_sleep: Optional[int] = None,
+ ) -> CdxRow:
"""
Fetches a single CDX row by url/datetime. Raises a KeyError if not
found, because we expect to be looking up a specific full record.
"""
if len(datetime) != 14:
- raise ValueError("CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
- params = {
- 'url': url,
- 'from': datetime,
- 'to': datetime,
- 'matchType': 'exact',
- 'limit': 1,
- 'output': 'json',
+ raise ValueError(
+ "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)
+ )
+ params: Dict[str, str] = {
+ "url": url,
+ "from": datetime,
+ "to": datetime,
+ "matchType": "exact",
+ "limit": "1",
+ "output": "json",
}
if filter_status_code:
- params['filter'] = "statuscode:{}".format(filter_status_code)
+ params["filter"] = "statuscode:{}".format(filter_status_code)
resp = self._query_api(params)
if not resp:
if retry_sleep and retry_sleep > 0:
@@ -224,23 +272,43 @@ class CdxApiClient:
if retry_sleep > 3:
next_sleep = retry_sleep - 3
retry_sleep = 3
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep
+ )
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
# allow fuzzy http/https match
if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
if retry_sleep and retry_sleep > 0:
- print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
time.sleep(retry_sleep)
- return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
- raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=None
+ )
+ raise KeyError(
+ "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(
+ url, datetime, row
+ )
+ )
if filter_status_code:
assert row.status_code == filter_status_code
return row
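
[Editor's note] A minimal usage sketch for `fetch()`, assuming a valid CDX_AUTH_TOKEN is set in the environment; the URL and timestamp are invented:

    from sandcrawler.ia import CdxApiClient

    client = CdxApiClient()  # reads CDX_AUTH_TOKEN from the environment
    # the full 14-digit capture timestamp is required; raises KeyError if no exact match
    row = client.fetch("https://example.org/paper.pdf", "20200202123456")
    print(row.status_code, row.mimetype, row.warc_path)
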
- def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None):
+ def lookup_best(
+ self,
+ url: str,
+ max_age_days: Optional[int] = None,
+ best_mimetype: Optional[str] = None,
+ closest: Union[datetime.datetime, str, None] = None,
+ ) -> Optional[CdxRow]:
"""
Fetches multiple CDX rows for the given URL, tries to find the most recent.
@@ -263,42 +331,50 @@ class CdxApiClient:
most-recent
"""
- params = {
- 'url': url,
- 'matchType': 'exact',
- 'limit': -25,
- 'output': 'json',
+ params: Dict[str, str] = {
+ "url": url,
+ "matchType": "exact",
+ "limit": "-40",
+ "output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
#'collapse': 'timestamp:6',
-
# Revisits now allowed and resolved!
#'filter': '!mimetype:warc/revisit',
}
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
- params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day),
+ params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
if closest:
- params['closest'] = closest
- params['sort'] = "closest"
- #print(params, file=sys.stderr)
+ if isinstance(closest, datetime.datetime):
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
+ else:
+ closest_dt = closest
+ params["closest"] = closest_dt
+ params["sort"] = "closest"
+ # print(params, file=sys.stderr)
rows = self._query_api(params)
if not rows:
return None
- def _cdx_sort_key(r):
+ def _cdx_sort_key(r: CdxRow) -> tuple:
"""
This is a function, not a lambda, because it captures
best_mimetype. Will create a tuple that can be used to sort in
*reverse* order.
"""
return (
+ int(r.url == url),
int(r.status_code in (200, 226)),
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
- int('/' in r.warc_path),
+ r.datetime[:4] == closest_dt[:4],
int(r.datetime),
+ # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+ int("/" in r.warc_path),
)
rows = sorted(rows, key=_cdx_sort_key)
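
[Editor's note] Taken together, the sort key prefers an exact URL match, then 200/226 status codes, then the requested mimetype, then non-revisit records, then captures from the same year as `closest`, then recency; the petabox `warc_path` check is now the lowest-priority tiebreaker (see the NOTE above). A hedged usage sketch with an invented URL:

    from sandcrawler.ia import CdxApiClient

    client = CdxApiClient()
    best = client.lookup_best("https://example.org/paper.pdf", best_mimetype="application/pdf")
    if best:
        print(best.datetime, best.status_code, best.mimetype)
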
@@ -308,39 +384,48 @@ class CdxApiClient:
class WaybackError(Exception):
pass
+
class WaybackContentError(Exception):
pass
+
class PetaboxError(Exception):
pass
+
class NoCaptureError(Exception):
pass
-class WaybackClient:
- def __init__(self, cdx_client=None, **kwargs):
+class WaybackClient:
+ def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs):
if cdx_client:
self.cdx_client = cdx_client
else:
self.cdx_client = CdxApiClient()
# /serve/ instead of /download/ doesn't record view count
# this *does* want to be http://, not https://
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
self.petabox_webdata_secret = kwargs.get(
- 'petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'),
+ "petabox_webdata_secret",
+ os.environ.get("PETABOX_WEBDATA_SECRET"),
)
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/")
self.rstore = None
self.max_redirects = 25
self.wayback_endpoint = "https://web.archive.org/web/"
self.replay_headers = {
- 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',
+ "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
}
+ self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
- def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
+ def fetch_petabox(
+ self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
+ ) -> WarcResource:
"""
Fetches wayback resource directly from petabox using WARC path/offset/csize.
@@ -363,33 +448,56 @@ class WaybackClient:
"""
if not self.petabox_webdata_secret:
raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
- if not "/" in warc_path:
- raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+ if "/" not in warc_path:
+ raise ValueError(
+ "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)
+ )
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory3(
- webdata_secret=self.petabox_webdata_secret,
- ))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ )
+ )
+ assert self.rstore
try:
- #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
- raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except wayback.exception.InvalidResource:
print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
- raise WaybackContentError("failed to load file contents from wayback/petabox (InvalidResource)")
+ raise WaybackContentError(
+ "failed to load file contents from wayback/petabox (InvalidResource)"
+ )
except urllib3.exceptions.ReadTimeoutError as rte:
- raise PetaboxError("failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(rte))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(
+ rte
+ )
+ )
except ValueError as ve:
- raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)
+ )
except EOFError as eofe:
- raise PetaboxError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)
+ )
except TypeError as te:
- raise PetaboxError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ )
+ )
except Exception as e:
if "while decompressing data: invalid block type" in str(e):
- raise PetaboxError("decompression error fetching WARC record; usually due to bad alexa ARC files")
+ raise PetaboxError(
+ "decompression error fetching WARC record; usually due to bad alexa ARC files"
+ )
else:
raise e
# Note: could consider a generic "except Exception" here, as we get so
@@ -402,7 +510,11 @@ class WaybackClient:
raise WaybackContentError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
- if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
+ if (
+ status_code is None
+ and gwb_record.target_uri.startswith(b"ftp://")
+ and not gwb_record.is_revisit()
+ ):
# TODO: some additional verification here?
status_code = 226
@@ -413,16 +525,21 @@ class WaybackClient:
raise WaybackContentError("found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
if not (revisit_uri and revisit_dt):
- raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
- warc_path, offset))
+ raise WaybackContentError(
+ "revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, offset
+ )
+ )
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)
if type(revisit_uri) is bytes:
- revisit_uri = revisit_uri.decode('utf-8')
+ revisit_uri = revisit_uri.decode("utf-8")
if type(revisit_dt) is bytes:
- revisit_dt = revisit_dt.decode('utf-8')
- revisit_dt = revisit_dt.replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ revisit_dt = revisit_dt.decode("utf-8")
+ revisit_dt = (
+ revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "")
+ )
assert len(revisit_dt) == 14
try:
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
@@ -440,10 +557,12 @@ class WaybackClient:
body = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
raise WaybackError(
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ )
+ )
elif status_code is None:
- raise WaybackContentError(
- "got a None status_code in (W)ARC record")
+ raise WaybackContentError("got a None status_code in (W)ARC record")
return WarcResource(
status_code=status_code,
location=location,
@@ -451,7 +570,14 @@ class WaybackClient:
revisit_cdx=revisit_cdx,
)
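
[Editor's note] A hedged sketch of the direct petabox fetch path, assuming PETABOX_WEBDATA_SECRET and CDX_AUTH_TOKEN are configured and reusing an invented CDX lookup:

    from sandcrawler.ia import WaybackClient

    wayback = WaybackClient()
    row = wayback.cdx_client.fetch("https://example.org/paper.pdf", "20200202123456")
    resource = wayback.fetch_petabox(
        csize=row.warc_csize,
        offset=row.warc_offset,
        warc_path=row.warc_path,
    )
    print(resource.status_code, resource.location, len(resource.body))
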
- def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True, expected_status_code=None):
+ def fetch_petabox_body(
+ self,
+ csize: int,
+ offset: int,
+ warc_path: str,
+ resolve_revisit: bool = True,
+ expected_status_code: Optional[int] = None,
+ ) -> bytes:
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -468,19 +594,22 @@ class WaybackClient:
if expected_status_code:
if expected_status_code != resource.status_code:
- raise KeyError("archived HTTP response (WARC) was not {}: {}".format(
- expected_status_code,
- resource.status_code,
+ raise KeyError(
+ "archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
)
)
elif resource.status_code not in (200, 226):
- raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
- resource.status_code)
+ raise KeyError(
+ "archived HTTP response (WARC) was not 200: {}".format(resource.status_code)
)
return resource.body
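
[Editor's note] Continuing the `fetch_petabox()` sketch above (same `wayback` and `row`), the body-only wrapper adds an optional status check:

    body = wayback.fetch_petabox_body(
        csize=row.warc_csize,
        offset=row.warc_offset,
        warc_path=row.warc_path,
        expected_status_code=200,  # raises KeyError if the capture has a different status
    )
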
- def fetch_replay_body(self, url, datetime, cdx_sha1hex=None):
+ def fetch_replay_body(
+ self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None
+ ) -> bytes:
"""
Fetches an HTTP 200 record from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -501,46 +630,59 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = requests.get(
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
-
- try:
- resp.raise_for_status()
- except Exception as e:
- raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
# defensively check that this is actually correct replay based on headers
- if not "X-Archive-Src" in resp.headers:
+ if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ # otherwise, a weird case (200/redirect but no X-Archive-Src header)
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
if cdx_sha1hex:
# verify that body matches CDX hash
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
- if cdx_sha1hex != file_meta['sha1hex']:
- print(" REPLAY MISMATCH: cdx:{} replay:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
- file=sys.stderr)
- raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
- cdx_sha1hex,
- file_meta['sha1hex']),
+ if cdx_sha1hex != file_meta["sha1hex"]:
+ print(
+ " REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ file=sys.stderr,
+ )
+ raise WaybackContentError(
+ "replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
)
return resp.content
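
[Editor's note] A minimal sketch of the replay-based fetch; it hits web.archive.org with the `id_` modifier, which returns the original, un-rewritten body. URL and timestamp are invented:

    from sandcrawler.ia import WaybackClient

    wayback = WaybackClient()
    # requests https://web.archive.org/web/20200202123456id_/https://example.org/paper.pdf
    body = wayback.fetch_replay_body("https://example.org/paper.pdf", "20200202123456")
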
- def fetch_replay_redirect(self, url, datetime):
+ def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
"""
Fetches an HTTP 3xx redirect Location from wayback via the replay interface
(web.archive.org) instead of petabox.
@@ -557,41 +699,65 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = requests.get(
+ # when fetching via `id_`, it is possible to get a 5xx error which
+ # is either a wayback error, or an actual replay of an upstream 5xx
+ # error. the exception control flow here is tweaked, and a
+ # different HTTP session is used, to try and differentiate between
+ # the two cases
+ resp = None
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
)
+ resp.raise_for_status()
except requests.exceptions.TooManyRedirects:
raise WaybackContentError("redirect loop (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
- try:
- resp.raise_for_status()
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
except Exception as e:
+ if resp is not None and "X-Archive-Src" in resp.headers:
+ raise WaybackContentError(
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
+ )
raise WaybackError(str(e))
- #print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
- if not "X-Archive-Src" in resp.headers:
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
- raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
redirect_url = resp.headers.get("Location")
# eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
- #print(redirect_url, file=sys.stderr)
+ # print(redirect_url, file=sys.stderr)
if redirect_url and redirect_url.startswith("http"):
redirect_url = clean_url(redirect_url)
return redirect_url
else:
return None
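
[Editor's note] And the redirect variant, using the example capture from the comment above:

    from sandcrawler.ia import WaybackClient

    wayback = WaybackClient()
    # for a captured 3xx record, returns the cleaned Location target (or None)
    target = wayback.fetch_replay_redirect(
        "https://dx.doi.org/10.17504/protocols.io.y2gfybw", "20200111003923"
    )
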
- def lookup_resource(self, start_url, best_mimetype=None, closest=None):
+ def lookup_resource(
+ self,
+ start_url: str,
+ best_mimetype: Optional[str] = None,
+ closest: Union[str, datetime.datetime, None] = None,
+ ) -> ResourceResult:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. Returns a ResourceResult object, which may indicate a
@@ -617,11 +783,13 @@ class WaybackClient:
"""
next_url = start_url
urls_seen = [start_url]
- for i in range(self.max_redirects):
+ for i in range(self.max_redirects + 1):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url, best_mimetype=best_mimetype, closest=closest)
- #print(cdx_row, file=sys.stderr)
- if not cdx_row:
+ next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
+ next_url, best_mimetype=best_mimetype, closest=closest
+ )
+ # print(next_row, file=sys.stderr)
+ if not next_row:
return ResourceResult(
start_url=start_url,
hit=False,
@@ -634,8 +802,10 @@ class WaybackClient:
revisit_cdx=None,
)
+ cdx_row: CdxRow = next_row
+
# first try straight-forward redirect situation
- if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
+ if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -648,15 +818,17 @@ class WaybackClient:
status="success",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
- terminal_status_code=resource.revisit_cdx.status_code, # ?
+ terminal_status_code=resource.revisit_cdx.status_code,
body=resource.body,
cdx=cdx_row,
revisit_cdx=resource.revisit_cdx,
)
+ # else, continue processing with revisit record
if cdx_row.status_code in (200, 226):
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -669,7 +841,7 @@ class WaybackClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
return ResourceResult(
start_url=start_url,
hit=True,
@@ -678,11 +850,11 @@ class WaybackClient:
terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
body=body,
- cdx=cdx_row,
+ cdx=final_cdx,
revisit_cdx=revisit_cdx,
)
elif 300 <= (cdx_row.status_code or 0) < 400:
- if '/' in cdx_row.warc_path:
+ if "/" in cdx_row.warc_path:
resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
@@ -703,21 +875,22 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- if not "://" in resource.location:
+ if "://" not in resource.location:
next_url = urllib.parse.urljoin(next_url, resource.location)
else:
next_url = resource.location
if next_url:
next_url = clean_url(next_url)
else:
- next_url = self.fetch_replay_redirect(
+ redirect_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- if next_url:
- next_url = clean_url(next_url)
- cdx_row = cdx_partial_from_row(cdx_row)
- if not next_url:
+ if redirect_url:
+ redirect_url = clean_url(redirect_url)
+ if redirect_url:
+ next_url = redirect_url
+ else:
print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
@@ -756,6 +929,7 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
+
return ResourceResult(
start_url=start_url,
hit=False,
@@ -772,39 +946,72 @@ class WaybackClient:
class SavePageNowError(Exception):
pass
+
class SavePageNowBackoffError(SandcrawlerBackoffError):
pass
-SavePageNowResult = namedtuple('SavePageNowResult', [
- 'success',
- 'status',
- 'job_id',
- 'request_url',
- 'terminal_url',
- 'terminal_dt',
- 'resources',
-])
-class SavePageNowClient:
+SavePageNowResult = namedtuple(
+ "SavePageNowResult",
+ [
+ "success",
+ "status",
+ "job_id",
+ "request_url",
+ "terminal_url",
+ "terminal_dt",
+ "resources",
+ ],
+)
- def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
- self.ia_access_key = kwargs.get('ia_access_key',
- os.environ.get('IA_ACCESS_KEY'))
- self.ia_secret_key = kwargs.get('ia_secret_key',
- os.environ.get('IA_SECRET_KEY'))
+
+class SavePageNowClient:
+ def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
+ self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
+ self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
self.v2endpoint = v2endpoint
- self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
- self.v2_session.headers.update({
- 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
- 'Accept': 'application/json',
- 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
- })
+ self.v2_session = requests_retry_session(
+ retries=5, backoff_factor=3, status_forcelist=[502, 504]
+ )
+ self.v2_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
+ "Accept": "application/json",
+ "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key),
+ }
+ )
# 3 minutes total
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
+ self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0)
+
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def save_url_now_v2(
+ self,
+ request_url: str,
+ force_simple_get: Optional[int] = None,
+ capture_outlinks: int = 0,
+ ) -> SavePageNowResult:
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -838,86 +1045,163 @@ class SavePageNowClient:
None,
None,
)
- resp = self.v2_session.post(
- self.v2endpoint,
- data={
- 'url': request_url,
- 'capture_all': 1,
- 'capture_outlinks': capture_outlinks,
- 'capture_screenshot': 0,
- 'if_not_archived_within': '1d',
- 'force_get': force_simple_get,
- 'skip_first_archive': 1,
- 'outlinks_availability': 0,
- 'js_behavior_timeout': 0,
- },
- )
+ if force_simple_get is None:
+ force_simple_get = 0
+ for domain in self.simple_get_domains:
+ if domain in request_url:
+ force_simple_get = 1
+ break
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
if resp.status_code == 429:
- raise SavePageNowBackoffError("status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowBackoffError(
+ "status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
elif resp.status_code != 200:
- raise SavePageNowError("SPN2 status_code: {}, url: {}".format(resp.status_code, request_url))
+ raise SavePageNowError(
+ "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ resp.raise_for_status()
resp_json = resp.json()
- if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json['message']:
- raise SavePageNowBackoffError(resp_json['message'])
- elif not resp_json or 'job_id' not in resp_json:
+ if (
+ resp_json
+ and "message" in resp_json
+ and "You have already reached the limit of active sessions" in resp_json["message"]
+ ):
+ raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
raise SavePageNowError(
- "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json))
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
+ )
- job_id = resp_json['job_id']
+ job_id = resp_json["job_id"]
print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
# poll until complete
final_json = None
for i in range(self.poll_count):
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, resp_json['job_id']))
+ resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
- status = resp.json()['status']
- if status == 'pending':
+ status = resp.json()["status"]
+ if status == "pending":
time.sleep(self.poll_seconds)
- elif status in ('success', 'error'):
+ elif status in ("success", "error"):
final_json = resp.json()
break
else:
- raise SavePageNowError("Unknown SPN2 status:{} url:{}".format(status, request_url))
+ raise SavePageNowError(
+ "Unknown SPN2 status:{} url:{}".format(status, request_url)
+ )
if not final_json:
raise SavePageNowError("SPN2 timed out (polling count exceeded)")
# if there was a recent crawl of same URL, fetch the status of that
# crawl to get correct datetime
- if final_json.get('original_job_id'):
- print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", file=sys.stderr)
- resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id']))
+ if final_json.get("original_job_id"):
+ print(
+ f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
+ file=sys.stderr,
+ )
+ resp = self.v2_session.get(
+ "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"])
+ )
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
- #print(final_json, file=sys.stderr)
+ # print(final_json, file=sys.stderr)
- if final_json['status'] == "success":
- if final_json.get('original_url').startswith('/'):
- print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", file=sys.stderr)
+ if final_json["status"] == "success":
+ if final_json.get("original_url").startswith("/"):
+ print(
+ f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}",
+ file=sys.stderr,
+ )
return SavePageNowResult(
True,
"success",
job_id,
request_url,
- final_json['original_url'],
- final_json['timestamp'],
- final_json['resources'],
+ final_json["original_url"],
+ final_json["timestamp"],
+ final_json.get("resources") or None,
)
else:
- if final_json['status'] == 'pending':
- final_json['status'] = 'error:pending'
+ if final_json["status"] == "pending":
+ final_json["status"] = "error:pending"
return SavePageNowResult(
False,
- final_json.get('status_ext') or final_json['status'],
+ final_json.get("status_ext") or final_json["status"],
job_id,
request_url,
None,
@@ -925,24 +1209,38 @@ class SavePageNowClient:
None,
)
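
[Editor's note] A hedged usage sketch for a one-off SPNv2 request, assuming IA_ACCESS_KEY and IA_SECRET_KEY are set; the URL is invented:

    from sandcrawler.ia import SavePageNowClient

    spn = SavePageNowClient()
    result = spn.save_url_now_v2("https://example.org/paper.pdf")
    print(result.success, result.status, result.terminal_url, result.terminal_dt)
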
- def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
+ def crawl_resource(
+ self,
+ start_url: str,
+ wayback_client: WaybackClient,
+ force_simple_get: Optional[int] = None,
+ ) -> ResourceResult:
"""
- Runs a SPN2 crawl, then fetches body from wayback.
+ Runs a SPN2 crawl, then fetches body.
- TODO: possible to fetch from petabox?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
- if 'gzbd.cnki.net/' in start_url:
- spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1)
+ if "gzbd.cnki.net/" in start_url:
+ spn_result = self.save_url_now_v2(
+ start_url, force_simple_get=force_simple_get, capture_outlinks=1
+ )
else:
spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
if not spn_result.success:
status = spn_result.status
- if status in ("error:invalid-url", "error:not-found",
- "error:invalid-host-resolution", "error:gateway-timeout",
- "error:too-many-redirects", "error:read-timeout"):
+ if status in (
+ "error:invalid-url",
+ "error:not-found",
+ "error:invalid-host-resolution",
+ "error:gateway-timeout",
+ "error:too-many-redirects",
+ "error:read-timeout",
+ ):
status = status.replace("error:", "")
elif status in ("error:no-access", "error:forbidden"):
status = "forbidden"
@@ -953,7 +1251,10 @@ class SavePageNowClient:
elif status.startswith("error:"):
status = "spn2-" + status
# despite other errors, call these a failure (so we don't retry)
- if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")):
+ if spn_result.terminal_url and (
+ spn_result.terminal_url.endswith("/cookieAbsent")
+ or spn_result.terminal_url.endswith("cookieSet=1")
+ ):
status = "blocked-cookie"
return ResourceResult(
start_url=start_url,
@@ -966,10 +1267,10 @@ class SavePageNowClient:
cdx=None,
revisit_cdx=None,
)
- #print(spn_result, file=sys.stderr)
+ # print(spn_result, file=sys.stderr)
# detect partial URL response (aka, success, but missing full URL)
- if not "://" in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -983,7 +1284,9 @@ class SavePageNowClient:
)
# don't try to CDX fetch for this common cookie block terminal
- if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"):
+ if spn_result.terminal_url.endswith(
+ "/cookieAbsent"
+ ) or spn_result.terminal_url.endswith("cookieSet=1"):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -996,7 +1299,7 @@ class SavePageNowClient:
revisit_cdx=None,
)
- cdx_row = None
+ cdx_row: Optional[CdxRow] = None
# hack to work around elsevier weirdness
if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
@@ -1008,7 +1311,7 @@ class SavePageNowClient:
cdx_row = elsevier_pdf_cdx
else:
print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
- #print(elsevier_pdf_cdx, file=sys.stderr)
+ # print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
# lookup exact
@@ -1020,8 +1323,17 @@ class SavePageNowClient:
url=spn_result.terminal_url,
datetime=spn_result.terminal_dt,
filter_status_code=filter_status_code,
- retry_sleep=9.0,
+ retry_sleep=self.spn_cdx_retry_sec,
)
+ # sometimes there are fuzzy http/https self-redirects with the
+ # same SURT; try to work around that
+ if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=self.spn_cdx_retry_sec,
+ )
except KeyError as ke:
print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
@@ -1036,10 +1348,11 @@ class SavePageNowClient:
revisit_cdx=None,
)
- #print(cdx_row, file=sys.stderr)
+ # print(cdx_row, file=sys.stderr)
revisit_cdx = None
- if '/' in cdx_row.warc_path:
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live
resource = wayback_client.fetch_petabox(
csize=cdx_row.warc_csize,
@@ -1057,7 +1370,7 @@ class SavePageNowClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- except (WaybackError, WaybackContentError) as we:
+ except (WaybackError, WaybackContentError):
return ResourceResult(
start_url=start_url,
hit=False,
@@ -1070,24 +1383,48 @@ class SavePageNowClient:
revisit_cdx=None,
)
# warc_path etc will change, so strip them out
- cdx_row = cdx_partial_from_row(cdx_row)
+ final_cdx = cdx_partial_from_row(cdx_row)
- return ResourceResult(
- start_url=start_url,
- hit=True,
- status="success",
- terminal_url=cdx_row.url,
- terminal_dt=cdx_row.datetime,
- terminal_status_code=cdx_row.status_code,
- body=body,
- cdx=cdx_row,
- revisit_cdx=revisit_cdx,
- )
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
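
[Editor's note] Putting the two clients together, a hedged end-to-end sketch (network access, credentials, and the URL are all assumptions):

    from sandcrawler.ia import SavePageNowClient, WaybackClient

    wayback = WaybackClient()
    spn = SavePageNowClient()
    resource = spn.crawl_resource("https://example.org/paper.pdf", wayback)
    if resource.hit:
        print(resource.terminal_url, resource.terminal_dt, len(resource.body))
    else:
        print(resource.status)
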
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
- if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+def fix_transfer_encoding(
+ file_meta: dict, resource: ResourceResult
+) -> Tuple[dict, ResourceResult]:
+ if (
+ resource.body
+ and file_meta["mimetype"] == "application/gzip"
+ and resource.cdx
+ and resource.cdx.mimetype != "application/gzip"
+ ):
+ print(
+ " transfer encoding not stripped: {}".format(resource.cdx.mimetype),
+ file=sys.stderr,
+ )
inner_body = gzip.decompress(resource.body)
if not inner_body:
raise Exception("null body inside transfer encoding")
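
[Editor's note] For intuition, a tiny self-contained illustration of the condition `fix_transfer_encoding()` repairs: a stored body still wrapped in gzip transfer encoding while the CDX record describes the inner payload.

    import gzip

    payload = b"%PDF-1.4 example bytes"        # what the CDX mimetype describes
    stored = gzip.compress(payload)            # what was actually captured
    assert gzip.decompress(stored) == payload  # the helper's core operation
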
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
deleted file mode 100644
index abcc156..0000000
--- a/python/sandcrawler/ingest.py
+++ /dev/null
@@ -1,754 +0,0 @@
-
-import sys
-import json
-import gzip
-import time
-import base64
-import xml.etree.ElementTree
-from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-import requests
-from selectolax.parser import HTMLParser
-
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_ingest import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
-from sandcrawler.xml import xml_reserialize
-
-
-class IngestFileWorker(SandcrawlerWorker):
- """
- High level flow is to look in history first, then go to live web if
- resource not found. Following redirects is treated as "fetching a
- resource". Current version fetches a single resource; if it isn't a hit
- but is an HTML 200, treats it as a landing page, tries to extract
- fulltext link, then fetches that resource.
-
- process(request, key=None) -> response
- Does all the things!
-
- Check existing processing (short circuit):
-
- check_existing_ingest(base_url) -> ingest_file_result or none
- process_existing(result) -> response
- try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_hit()
-
- Fetch resource:
-
- find_resource(url) -> ResourceResult
-
- Process resource:
-
- process_hit(ResourceResult) -> response
- process_grobid(ResourceResult)
- """
-
- def __init__(self, sink=None, **kwargs):
- super().__init__()
-
- self.sink = sink
- self.wayback_client = kwargs.get('wayback_client')
- if not self.wayback_client:
- self.wayback_client = WaybackClient()
- self.spn_client = kwargs.get('spn_client')
- if not self.spn_client:
- self.spn_client = SavePageNowClient()
- self.grobid_client = kwargs.get('grobid_client')
- if not self.grobid_client:
- self.grobid_client = GrobidClient()
- self.pgrest_client = kwargs.get('pgrest_client')
- if not self.pgrest_client:
- self.pgrest_client = SandcrawlerPostgrestClient()
- self.grobid_sink = kwargs.get('grobid_sink')
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
- self.pdftext_sink = kwargs.get('pdftext_sink')
- self.xmldoc_sink = kwargs.get('xmldoc_sink')
- self.htmlteixml_sink = kwargs.get('htmlteixml_sink')
- self.max_hops = 6
-
- self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
- self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
- self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
- self.try_wayback = kwargs.get('try_wayback', True)
- self.try_spn2 = kwargs.get('try_spn2', True)
- self.html_quick_mode = kwargs.get('html_quick_mode', False)
- self.adblock_rules = load_adblock_rules()
- self.max_html_resources = 200
-
- self.base_url_blocklist = [
- # robot blocking
- "://hkvalidate.perfdrive.com/",
-
- # temporary, until we implement specific fetch and 'petabox' output
- "://archive.org/",
- "://www.archive.org/",
- "://web.archive.org/web/",
- "://openlibrary.org/",
- "://www.openlibrary.org/",
- "://fatcat.wiki/",
-
- # Domain squats
- "://bartandjones.com",
- "://ijretm.com",
- "://ijrcemas.com",
- "://jist.net.in",
- "://croisements-revue.org",
-
- # all stubs/previews, not full papers
- "://page-one.live.cf.public.springer.com",
-
- # large datasets-only (no PDF expected)
- "plutof.ut.ee/",
- "www.gbif.org/",
- "doi.pangaea.de/",
- "www.plate-archive.org/",
- "://doi.org/10.25642/ipk/gbis/",
- "://apex.ipk-gatersleben.de/",
-
- # Historical non-paper content:
- "dhz.uni-passau.de/", # newspapers
- "digital.ucd.ie/", # ireland national historical
-
- # DOI prefixes
- "://doi.org/10.2307/", # JSTOR; slow and many redirects
- ]
-
- self.wall_blocklist = [
- # loginwall
- "://profile.thieme.de/HTML/sso/ejournals/login.htm",
- "://login.bepress.com/"
- "?SAMLRequest="
- ]
-
- # these are special-case web domains for which we want SPN2 to not run
- # a headless browser (brozzler), but instead simply run wget.
- # the motivation could be to work around browser issues, or in the
- # future possibly to increase download efficiency (wget/fetch being
- # faster than browser fetch)
- self.spn2_simple_get_domains = [
- # direct PDF links
- "://arxiv.org/pdf/",
- "://europepmc.org/backend/ptpmcrender.fcgi",
- "://pdfs.semanticscholar.org/",
- "://res.mdpi.com/",
-
- # platform sites
- "://zenodo.org/",
- "://figshare.org/",
- "://springernature.figshare.com/",
-
- # popular simple cloud storage or direct links
- "://s3-eu-west-1.amazonaws.com/",
- ]
-
-
- def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
- """
- Check in sandcrawler-db (postgres) to see if we have already ingested
- this URL (ingest file result table).
-
- Returns existing row *if* found *and* we should use it, otherwise None.
-
- Looks at existing ingest results and makes a decision based on, eg,
- status and timestamp.
- """
- if not self.try_existing_ingest:
- return None
- existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
- # TODO: filter on more flags?
- if existing and existing['hit'] == True:
- return existing
- else:
- return None
-
- def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]:
- """
- Looks in wayback for a resource starting at the URL, following any
- redirects. If a hit isn't found, try crawling with SPN.
- """
- via = "none"
- resource = None
-
- if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"):
- raise NotImplementedError("handling direct wayback links not supported yet")
-
- if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
- raise NotImplementedError("fetching from archive.org not implemented yet")
-
- if self.try_wayback and not force_recrawl:
- via = "wayback"
- resource = self.wayback_client.lookup_resource(url, best_mimetype)
-
- # check for "soft 404" conditions, where we should retry with live SPNv2
- soft404 = False
- # NOTE: these are often not working with SPNv2 either, so disabling. If
- # we really want to try again, should do force-recrawl
- #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
- # soft404 = True
-
- old_failure = False
- if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000':
- old_failure = True
-
- if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure):
- via = "spn2"
- force_simple_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in url:
- force_simple_get = 1
- break
- resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
- print("[FETCH {:>6}] {} {}".format(
- via,
- (resource and resource.status),
- (resource and resource.terminal_url) or url),
- file=sys.stderr)
- return resource
-
- def process_existing(self, request: dict, result_row: dict) -> dict:
- """
- If we have an existing ingest file result, do any database fetches or
- additional processing necessary to return a result.
- """
- raise NotImplementedError("process_existing() not tested or safe yet")
- assert result_row['hit']
- existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
- existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
- if not (existing_file_meta and existing_grobid and existing_cdx):
- raise NotImplementedError("partially-exsiting records not implemented yet")
- result = {
- 'hit': result_row['hit'],
- 'status': "existing",
- 'request': request,
- 'grobid': existing_grobid,
- 'file_meta': existing_file_meta,
- 'cdx': existing_cdx,
- 'terminal': {
- 'terminal_url': result_row['terminal_url'],
- 'terminal_dt': result_row['terminal_dt'],
- 'terminal_status_code': result_row['terminal_status_code'],
- 'terminal_sha1hex': result_row['terminal_sha1hex'],
- },
- }
- return result
-
- def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Run all the necessary processing for a new/fresh ingest hit.
- """
- if ingest_type == "pdf":
- return {
- 'grobid': self.process_grobid(resource, file_meta),
- 'pdf_meta': self.process_pdfextract(resource, file_meta),
- }
- elif ingest_type == "xml":
- return {
- 'xml_meta': self.process_xml(resource, file_meta),
- }
- elif ingest_type == "html":
- html_info = self.process_html(resource, file_meta)
- # if there is no html_biblio, don't clobber anything possibly extracted earlier
- if 'html_biblio' in html_info and not html_info['html_biblio']:
- html_info.pop('html_biblio')
- return html_info
- else:
- raise NotImplementedError(f"process {ingest_type} hit")
-
- def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Submits to resource body to GROBID for processing.
-
- TODO: By default checks sandcrawler-db for an existing row first, then
- decide if we should re-process
- """
- if self.try_existing_grobid:
- existing = self.pgrest_client.get_grobid(file_meta['sha1hex'])
- if existing:
- print("found existing GROBID result", file=sys.stderr)
- return existing
-
- # Need to actually processes
- result = self.grobid_client.process_fulltext(resource.body)
- if self.grobid_sink:
- # extra fields for GROBID kafka messages
- result['file_meta'] = file_meta
- result['key'] = result['file_meta']['sha1hex']
- self.grobid_sink.push_record(result.copy())
- if result['status'] == "success":
- metadata = self.grobid_client.metadata(result)
- if metadata:
- result['metadata'] = self.grobid_client.metadata(result)
- result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
- result['grobid_version'] = result['metadata'].pop('grobid_version', None)
- result.pop('tei_xml', None)
- result.pop('file_meta', None)
- result.pop('key', None)
- return result
-
- def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Extracts thumbnail and pdf_meta info from PDF.
-
- By default checks sandcrawler-db for an existing row first, then decide
- if we should re-process.
-
- TODO: difference between Kafka schema and SQL/postgrest schema
- """
- if self.try_existing_pdfextract:
- existing = self.pgrest_client.get_pdf_meta(file_meta['sha1hex'])
- if existing:
- print("found existing pdf_meta result", file=sys.stderr)
- result = PdfExtractResult.from_pdf_meta_dict(existing)
- return result.to_pdftext_dict()
-
- # Need to actually processes
- result = process_pdf(resource.body)
- assert result.file_meta['sha1hex'] == file_meta['sha1hex']
- if self.thumbnail_sink and result.page0_thumbnail is not None:
- self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
- if self.pdftext_sink:
- self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
- result.page0_thumbnail = None
- result.text = None
- result.file_meta = None
- return result.to_pdftext_dict()
-
- def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
- """
- Simply publishes to Kafka topic.
-
- In the future, could extract other metadata here (like body word
- count), or attempting to fetch sub-resources.
- """
- if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
- try:
- jats_xml = xml_reserialize(resource.body)
- except xml.etree.ElementTree.ParseError:
- return dict(status="xml-parse-error")
- msg = dict(
- sha1hex=file_meta["sha1hex"],
- status="success",
- jats_xml=jats_xml,
- )
- self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex'])
- return dict(status="success")
-
- def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
-
- assert resource.body
- try:
- html_doc = HTMLParser(resource.body)
- except ValueError as ve:
- return dict(
- status="html-selectolax-error",
- )
- html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
- assert html_biblio
- html_body = html_extract_body_teixml(resource.body)
- html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
- html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
- html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
-
- if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'):
- return dict(
- status=html_scope,
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- )
- elif html_scope == 'unknown':
- html_body.pop("tei_xml", None)
- return dict(
- status="unknown-scope",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
- elif html_scope not in ('article-fulltext',):
- html_body.pop("tei_xml", None)
- return dict(
- status="wrong-scope",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
- if len(raw_resources) > self.max_html_resources:
- html_body.pop("tei_xml", None)
- return dict(
- status="too-many-resources",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- if self.htmlteixml_sink and html_body['status'] == "success":
- self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])
-
- html_body.pop("tei_xml", None)
-
- partial_result = dict(
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
-
- when = parse_cdx_datetime(resource.cdx.datetime)
- full_resources: List[WebResource] = []
-
- try:
- if self.html_quick_mode:
- print(" WARN: running quick CDX-only fetches", file=sys.stderr)
- full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
- else:
- full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
- except PetaboxError as e:
- partial_result['status'] = 'petabox-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except CdxApiError as e:
- partial_result['status'] = 'cdx-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except WaybackError as e:
- partial_result['status'] = 'wayback-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except WaybackContentError as e:
- partial_result['status'] = 'wayback-content-error'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
- except NoCaptureError as e:
- partial_result['status'] = 'html-resource-no-capture'
- partial_result['error_message'] = str(e)[:1600]
- return partial_result
-
- return dict(
- html_body=html_body,
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
- )
-
- def timeout_response(self, task: dict) -> dict:
- print("[TIMEOUT]", file=sys.stderr)
- return dict(
- request=task,
- hit=False,
- status="timeout",
- error_message="ingest worker internal timeout",
- )
-
- def want(self, request: dict) -> bool:
- if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
- return False
- return True
-
- def process(self, request: dict, key: Any = None) -> dict:
-
- # old backwards compatibility
- if request.get('ingest_type') == 'file':
- request['ingest_type'] = 'pdf'
-
- ingest_type = request.get('ingest_type')
- if ingest_type not in ("pdf", "xml", "html"):
- raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
-
- # parse/clean URL
- # note that we pass through the original/raw URL, and that is what gets
- # persisted in database table
- base_url = clean_url(request['base_url'])
-
- force_recrawl = bool(request.get('force_recrawl', False))
-
- for block in self.base_url_blocklist:
- if block in base_url:
- print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
- return dict(request=request, hit=False, status="skip-url-blocklist")
-
- print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
-
- best_mimetype = None
- if ingest_type == "pdf":
- best_mimetype = "application/pdf"
- elif ingest_type == "xml":
- best_mimetype = "text/xml"
- elif ingest_type == "html":
- best_mimetype = "text/html"
-
- existing = self.check_existing_ingest(ingest_type, base_url)
- if existing:
- return self.process_existing(request, existing)
-
- result: Dict[str, Any] = dict(request=request, hit=False)
-
- next_url = base_url
- hops = [base_url]
-
- while len(hops) <= self.max_hops:
-
- result['hops'] = hops
-
- # check against blocklist again on each hop
- for block in self.base_url_blocklist:
- if block in next_url:
- result['status'] = "skip-url-blocklist"
- return result
-
- # check against known loginwall URLs
- for block in self.wall_blocklist:
- if block in next_url:
- result['status'] = "skip-wall"
- return result
-
- # check for popular cookie blocking URL patterns. On successful SPN
- # crawls, shouldn't see these redirect URLs
- if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
- result['status'] = 'blocked-cookie'
- return result
-
- try:
- resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
- except SavePageNowError as e:
- result['status'] = 'spn2-error'
- result['error_message'] = str(e)[:1600]
- return result
- except PetaboxError as e:
- result['status'] = 'petabox-error'
- result['error_message'] = str(e)[:1600]
- return result
- except CdxApiError as e:
- result['status'] = 'cdx-error'
- result['error_message'] = str(e)[:1600]
- # add a sleep in cdx-error path as a slow-down
- time.sleep(2.0)
- return result
- except WaybackError as e:
- result['status'] = 'wayback-error'
- result['error_message'] = str(e)[:1600]
- return result
- except WaybackContentError as e:
- result['status'] = 'wayback-content-error'
- result['error_message'] = str(e)[:1600]
- return result
- except NotImplementedError as e:
- result['status'] = 'not-implemented'
- result['error_message'] = str(e)[:1600]
- return result
-
- assert resource
-
- if resource.terminal_url:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
- if resource.terminal_url not in result['hops']:
- result['hops'].append(resource.terminal_url)
-
- if not resource.hit:
- result['status'] = resource.status
- return result
-
- if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
- result['status'] = 'blocked-cookie'
- return result
-
- if not resource.body:
- result['status'] = 'null-body'
- return result
-
- file_meta = gen_file_metadata(resource.body)
- try:
- file_meta, resource = fix_transfer_encoding(file_meta, resource)
- except Exception as e:
- result['status'] = 'bad-gzip-encoding'
- result['error_message'] = str(e)
- return result
-
- if not resource.body or file_meta['size_bytes'] == 0:
- result['status'] = 'null-body'
- return result
-
- # here we split based on ingest type to try and extract a next hop
- html_ish_resource = bool(
- "html" in file_meta['mimetype']
- or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml"
- or "application/xml" in file_meta['mimetype']
- or "text/xml" in file_meta['mimetype']
- )
- html_biblio = None
- html_doc = None
- if html_ish_resource and resource.body:
- try:
- html_doc = HTMLParser(resource.body)
- html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
- if html_biblio:
- if not 'html_biblio' in result or html_biblio.title:
- result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
- #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
- except ValueError:
- pass
-
- if ingest_type == "pdf" and html_ish_resource:
-
- # the new style of URL extraction (already computed)
- if html_biblio and html_biblio.pdf_fulltext_url:
- fulltext_url = dict(
- pdf_url=html_biblio.pdf_fulltext_url,
- technique="html_biblio",
- )
- else:
- fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
- result['extract_next_hop'] = fulltext_url
- if not fulltext_url:
- result['status'] = 'no-pdf-link'
- return result
- next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') or ""
- assert next_url
- next_url = clean_url(next_url)
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- fulltext_url.get('technique'),
- next_url,
- ),
- file=sys.stderr)
- if next_url in hops:
- result['status'] = 'link-loop'
- result['error_message'] = "repeated: {}".format(next_url)
- return result
- hops.append(next_url)
- continue
- elif ingest_type == "xml" and html_ish_resource:
- if html_biblio and html_biblio.xml_fulltext_url:
- next_url = html_biblio.xml_fulltext_url
- technique = "html_biblio"
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- technique,
- next_url,
- ),
- file=sys.stderr)
- if next_url in hops:
- result['status'] = 'link-loop'
- result['error_message'] = "repeated: {}".format(next_url)
- return result
- hops.append(next_url)
- continue
- elif ingest_type == "html" and html_ish_resource:
- if html_biblio and html_biblio.html_fulltext_url:
- next_url = html_biblio.html_fulltext_url
- technique = "html_biblio"
- if next_url in hops:
- # for HTML ingest, we don't count this as a link-loop
- break
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- technique,
- next_url,
- ),
- file=sys.stderr)
- hops.append(next_url)
- continue
-
- # default is to NOT keep hopping
- break
-
- if len(hops) >= self.max_hops:
- result['status'] = "max-hops-exceeded"
- return result
-
- # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
- assert resource
- assert resource.hit == True
- assert resource.terminal_status_code in (200, 226)
-
- if resource.terminal_url:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- "terminal_sha1hex": file_meta['sha1hex'],
- }
-
- result['file_meta'] = file_meta
- result['cdx'] = cdx_to_dict(resource.cdx)
- if resource.revisit_cdx:
- result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
-
- if ingest_type == "pdf":
- if file_meta['mimetype'] != "application/pdf":
- result['status'] = "wrong-mimetype" # formerly: "other-mimetype"
- return result
- elif ingest_type == "xml":
- if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
- result['status'] = "wrong-mimetype"
- return result
- elif ingest_type == "html":
- if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
- result['status'] = "wrong-mimetype"
- return result
- else:
- raise NotImplementedError()
-
- info = self.process_hit(ingest_type, resource, file_meta)
- result.update(info)
-
- # check if processing turned up an error
- if info.get('status') not in ('success', None):
- result['status'] = info['status']
- return result
-
- result['status'] = "success"
- result['hit'] = True
- if ingest_type == "pdf":
- print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
- ingest_type,
- result.get('file_meta', {}).get('sha1hex'),
- result.get('grobid', {}).get('status_code'),
- result.get('pdf_meta', {}).get('status'),
- ),
- file=sys.stderr)
- else:
- print("[SUCCESS {:>5}] sha1:{}".format(
- ingest_type,
- result.get('file_meta', {}).get('sha1hex'),
- ),
- file=sys.stderr)
- return result
-
-
-class IngestFileRequestHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- if self.path != "/ingest":
- self.send_response(404)
- self.end_headers()
- self.wfile.write("404: Not Found")
- return
- length = int(self.headers.get('content-length'))
- request = json.loads(self.rfile.read(length).decode('utf-8'))
- print("Got request: {}".format(request))
- ingester = IngestFileWorker()
- result = ingester.process(request)
- self.send_response(200)
- self.end_headers()
- self.wfile.write(json.dumps(result))
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
new file mode 100644
index 0000000..03277f8
--- /dev/null
+++ b/python/sandcrawler/ingest_file.py
@@ -0,0 +1,925 @@
+import json
+import sys
+import time
+import xml.etree.ElementTree
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
+
+from selectolax.parser import HTMLParser
+
+from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import (
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiError,
+ NoCaptureError,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_html import (
+ WebResource,
+ fetch_html_resources,
+ html_extract_body_teixml,
+ html_guess_platform,
+ html_guess_scope,
+ quick_fetch_html_resources,
+)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.xml import xml_reserialize
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFileWorker(SandcrawlerWorker):
+ """
+    The high-level flow is to look in (wayback) history first, then go to the
+    live web if the resource is not found. Following redirects is treated as
+    "fetching a resource". The current version fetches a single resource; if
+    it isn't a hit but is an HTML 200, it is treated as a landing page, a
+    fulltext link is extracted, and that resource is fetched in turn.
+
+ process(request, key=None) -> response
+ Does all the things!
+
+ Check existing processing (short circuit):
+
+ check_existing_ingest(base_url) -> ingest_file_result or none
+ process_existing(result) -> response
+            try fetching all the rows we want; if any don't exist, fetch the resource itself and call process_file_hit()
+
+ Fetch resource:
+
+ find_resource(url) -> ResourceResult
+
+ Process resource:
+
+ process_file_hit(ResourceResult) -> response
+ process_grobid(ResourceResult)
+ """
+
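+    # A minimal usage sketch (illustrative only; "ingest_type" and "base_url"
+    # are the request fields process_file() actually reads, but the URL itself
+    # is just an assumed example):
+    #
+    #   worker = IngestFileWorker(try_spn2=False)
+    #   result = worker.process(
+    #       {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}
+    #   )
+    #   print(result["status"], result.get("hit"))
+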
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__()
+
+ self.sink = sink
+
+ if kwargs.get("wayback_client"):
+ self.wayback_client: WaybackClient = kwargs["wayback_client"]
+ else:
+ self.wayback_client = WaybackClient()
+
+ if kwargs.get("spn_client"):
+ self.spn_client: SavePageNowClient = kwargs["spn_client"]
+ else:
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+
+ if kwargs.get("grobid_client"):
+ self.grobid_client: GrobidClient = kwargs["grobid_client"]
+ else:
+ self.grobid_client = GrobidClient()
+
+ if kwargs.get("pgrest_client"):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs["pgrest_client"]
+ else:
+ self.pgrest_client = SandcrawlerPostgrestClient()
+
+ self.grobid_sink = kwargs.get("grobid_sink")
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+ self.pdftext_sink = kwargs.get("pdftext_sink")
+ self.xmldoc_sink = kwargs.get("xmldoc_sink")
+ self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
+ self.max_hops = 8
+
+ self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
+ self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
+ self.try_existing_pdfextract = kwargs.get("try_existing_pdfextract", True)
+ self.try_wayback = kwargs.get("try_wayback", True)
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.html_quick_mode = kwargs.get("html_quick_mode", False)
+ self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
+
+ self.base_url_blocklist = [
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
+ "://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
+ # temporary, until we implement specific fetch and 'petabox' output
+ "://archive.org/",
+ "://www.archive.org/",
+ "://web.archive.org/web/",
+ # out of scope
+ "://openlibrary.org/",
+ "://www.openlibrary.org/",
+ "://fatcat.wiki/",
+ "://scholar.archive.org/",
+ "://orcid.org/",
+ # Domain squats
+ "://bartandjones.com",
+ "://ijretm.com",
+ "://ijrcemas.com",
+ "://jist.net.in",
+ "://croisements-revue.org",
+ # all stubs/previews, not full papers
+ "://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+ "://doi.org/10.25642/ipk/gbis/",
+ "://apex.ipk-gatersleben.de/",
+ "fao.org/glis/",
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
+ # DOI prefixes
+ "doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.18730/", # fao.org: database entry
+ "doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
+ # deprecated domain (doesn't redirect correctly)
+ "://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
+ ]
+
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
+ "/password-login",
+ "://gateway.isiknowledge.com/",
+ "/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
+ ]
+
+ self.cookie_blocklist = [
+ "/cookieAbsent",
+ "cookieSet=1",
+ "error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
+ ]
+
+ self.src_valid_mimetypes = [
+ "text/x-tex",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip",
+ "application/x-tar",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ]
+
+ self.component_valid_mimetypes = [
+ "image/jpeg",
+ "image/tiff",
+ "image/png",
+ "image/gif",
+ "audio/mpeg",
+ "video/mp4",
+ "video/mpeg",
+ "text/plain",
+ "text/csv",
+ "text/x-r-source", # dataverse
+ "text/tab-separated-values", # dataverse
+ "text/x-rst", # dataverse
+ "application/x-rlang-transport", # dataverse
+ "application/json",
+ "application/xml",
+ "application/pdf",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+            "application/zip",
+            "application/x-rar",
+ "application/x-7z-compressed",
+ "application/x-tar",
+ "application/vnd.ms-powerpoint",
+ "application/vnd.ms-excel",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ]
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Check in sandcrawler-db (postgres) to see if we have already ingested
+ this URL (ingest file result table).
+
+ Returns existing row *if* found *and* we should use it, otherwise None.
+
+ Looks at existing ingest results and makes a decision based on, eg,
+ status and timestamp.
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def find_resource(
+ self, url: str, best_mimetype: Optional[str] = None, force_recrawl: bool = False
+ ) -> Optional[ResourceResult]:
+ """
+ Looks in wayback for a resource starting at the URL, following any
+        redirects. If a hit isn't found, tries crawling with SPN.
+ """
+ via = "none"
+ resource = None
+
+ if url.startswith("http://web.archive.org/web/") or url.startswith(
+ "https://web.archive.org/web/"
+ ):
+ raise NotImplementedError("handling direct wayback links not supported yet")
+
+ if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+ raise NotImplementedError("fetching from archive.org not implemented yet")
+
+ if self.try_wayback and not force_recrawl:
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(url, best_mimetype)
+
+ # check for "soft 404" conditions, where we should retry with live SPNv2
+ soft404 = False
+        # NOTE: these often don't work with SPNv2 either, so this is disabled. If
+        # we really want to try again, we should do a force-recrawl
+ # if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+ # soft404 = True
+
+ old_failure = False
+ if (
+ resource
+ and not resource.hit
+ and resource.terminal_dt
+ and resource.terminal_dt < "20190000000000"
+ ):
+ old_failure = True
+
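+        # fall back to live crawling via SPNv2 (if enabled) when wayback gave us
+        # nothing usable: no lookup result, an explicit "no-capture" status, a
+        # suspected soft-404, or a failed capture from before 2019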
+ if self.try_spn2 and (
+ resource is None
+ or (resource and resource.status == "no-capture")
+ or soft404
+ or old_failure
+ ):
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via, (resource and resource.status), (resource and resource.terminal_url) or url
+ ),
+ file=sys.stderr,
+ )
+ return resource
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest file result, do any database fetches or
+ additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+ assert result_row["hit"]
+ existing_file_meta = self.pgrest_client.get_file_meta(result_row["terminal_sha1hex"])
+ existing_grobid = self.pgrest_client.get_grobid(result_row["terminal_sha1hex"])
+ existing_cdx = self.pgrest_client.get_cdx(
+ result_row["terminal_url"], result_row["terminal_dt"]
+ )
+ if not (existing_file_meta and existing_grobid and existing_cdx):
+            raise NotImplementedError("partially-existing records not implemented yet")
+ result = {
+ "hit": result_row["hit"],
+ "status": "existing",
+ "request": request,
+ "grobid": existing_grobid,
+ "file_meta": existing_file_meta,
+ "cdx": existing_cdx,
+ "terminal": {
+ "terminal_url": result_row["terminal_url"],
+ "terminal_dt": result_row["terminal_dt"],
+ "terminal_status_code": result_row["terminal_status_code"],
+ "terminal_sha1hex": result_row["terminal_sha1hex"],
+ },
+ }
+ return result
+
+ def process_file_hit(
+ self, ingest_type: str, resource: ResourceResult, file_meta: dict
+ ) -> dict:
+ """
+ Run all the necessary processing for a new/fresh ingest hit.
+ """
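+        # dispatch on ingest_type; PDFs fetched for 'dataset-file' or 'component'
+        # requests are re-typed to 'pdf', while 'src', 'component', and
+        # 'dataset-file' hits otherwise need no type-specific processing here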
+ if (
+ ingest_type in ["dataset-file", "component"]
+ and file_meta["mimetype"] == "application/pdf"
+ ):
+ ingest_type = "pdf"
+ if ingest_type == "pdf":
+ return {
+ "grobid": self.process_grobid(resource, file_meta),
+ "pdf_meta": self.process_pdfextract(resource, file_meta),
+ }
+ elif ingest_type == "xml":
+ return {
+ "xml_meta": self.process_xml(resource, file_meta),
+ }
+ elif ingest_type == "html":
+ html_info = self.process_html(resource, file_meta)
+ # if there is no html_biblio, don't clobber anything possibly extracted earlier
+ if "html_biblio" in html_info and not html_info["html_biblio"]:
+ html_info.pop("html_biblio")
+ return html_info
+ elif ingest_type == "src":
+ return {}
+ elif ingest_type == "component":
+ return {}
+ elif ingest_type == "dataset-file":
+ return {}
+ else:
+ raise NotImplementedError(f"process {ingest_type} hit")
+
+ def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+        Submits the resource body to GROBID for processing.
+
+        TODO: by default, checks sandcrawler-db for an existing row first, then
+        decides whether we should re-process
+ """
+ if self.try_existing_grobid:
+ existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
+ if existing:
+ # grobid_timestamp = existing.get("grobid_timestamp") or None
+ # status
+ grobid_version = existing.get("grobid_version") or None
+ if grobid_version and grobid_version.startswith("0.7"):
+ print("found existing GROBID result", file=sys.stderr)
+ return existing
+
+        # Need to actually process
+ result = self.grobid_client.process_fulltext(resource.body)
+ if self.grobid_sink:
+ # extra fields for GROBID kafka messages
+ result["file_meta"] = file_meta
+ result["key"] = result["file_meta"]["sha1hex"]
+ self.grobid_sink.push_record(result.copy())
+ if result["status"] == "success":
+ metadata = self.grobid_client.metadata(result)
+ if metadata:
+ result["metadata"] = metadata
+ result["fatcat_release"] = metadata.pop("fatcat_release", None)
+ result["grobid_version"] = metadata.pop("grobid_version", None)
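+        # strip the bulky/duplicated fields before returning; any configured
+        # grobid_sink already received the full record above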
+ result.pop("tei_xml", None)
+ result.pop("file_meta", None)
+ result.pop("key", None)
+ return result
+
+ def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+        Extracts thumbnail and pdf_meta info from the PDF.
+
+        By default, checks sandcrawler-db for an existing row first, then
+        decides whether we should re-process.
+
+ TODO: difference between Kafka schema and SQL/postgrest schema
+ """
+ if self.try_existing_pdfextract:
+ existing = self.pgrest_client.get_pdf_meta(file_meta["sha1hex"])
+ if existing:
+ print("found existing pdf_meta result", file=sys.stderr)
+ result = PdfExtractResult.from_pdf_meta_dict(existing)
+ return result.to_pdftext_dict()
+
+        # Need to actually process
+ result = process_pdf(resource.body)
+ assert result.sha1hex == file_meta["sha1hex"]
+ assert result.file_meta is not None
+ assert result.file_meta["sha1hex"] == file_meta["sha1hex"]
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+ if self.pdftext_sink:
+ self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
+ result.page0_thumbnail = None
+ result.text = None
+ result.file_meta = None
+ return result.to_pdftext_dict()
+
+ def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+        Simply publishes to a Kafka topic.
+
+        In the future, we could extract other metadata here (like body word
+        count), or attempt to fetch sub-resources.
+ """
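+        # only JATS XML gets republished to the topic; other XML mimetypes (and
+        # runs without a configured sink) still return a bare success status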
+ if self.xmldoc_sink and file_meta["mimetype"] == "application/jats+xml":
+ try:
+ jats_xml = xml_reserialize(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="xml-parse-error")
+ msg = dict(
+ sha1hex=file_meta["sha1hex"],
+ status="success",
+ jats_xml=jats_xml,
+ )
+ self.xmldoc_sink.push_record(msg, key=file_meta["sha1hex"])
+ return dict(status="success")
+
+ def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+ assert resource.body
+ try:
+ html_doc = HTMLParser(resource.body)
+ except ValueError:
+ return dict(status="html-selectolax-error")
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
+ html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
+ html_scope = html_guess_scope(
+ resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
+ )
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
+
+ if html_scope in ("blocked-captcha", "blocked-cookie", "blocked-forbidden"):
+ return dict(
+ status=html_scope,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ )
+ elif html_scope not in (
+ "article-fulltext",
+ "unknown",
+ ):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="wrong-scope",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ raw_resources = html_extract_resources(
+ resource.terminal_url, html_doc, self.adblock_rules
+ )
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ if self.htmlteixml_sink and html_body["status"] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta["sha1hex"])
+
+ html_body.pop("tei_xml", None)
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ when = parse_cdx_datetime(resource.cdx.datetime)
+ full_resources: List[WebResource] = []
+
+ try:
+ if self.html_quick_mode:
+ print(" WARN: running quick CDX-only fetches", file=sys.stderr)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, self.wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result["status"] = "petabox-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result["status"] = "cdx-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result["status"] = "wayback-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result["status"] = "wayback-content-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except NoCaptureError as e:
+ partial_result["status"] = "html-resource-no-capture"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+
+ info = dict(
+ html_body=html_body,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+ )
+ if html_scope == "unknown":
+ info["status"] = "unknown-scope"
+ return info
+
+ def timeout_response(self, task: dict) -> dict:
+ print("[TIMEOUT]", file=sys.stderr)
+ return dict(
+ request=task,
+ hit=False,
+ status="timeout",
+ error_message="ingest worker internal timeout",
+ )
+
+ def want(self, request: dict) -> bool:
+        if request.get("ingest_type") not in ("file", "pdf", "xml", "html", "src", "component"):
+ return False
+ return True
+
+ def process(self, request: dict, key: Any = None) -> dict:
+ return self.process_file(request, key=key)
+
+ def process_file(self, request: dict, key: Any = None) -> dict:
+
+ # old backwards compatibility
+ if request.get("ingest_type") == "file":
+ request["ingest_type"] = "pdf"
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ best_mimetype = None
+ if ingest_type == "pdf":
+ best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
+
+ existing = self.check_existing_ingest(ingest_type, base_url)
+ if existing:
+ return self.process_existing(request, existing)
+
+ result: Dict[str, Any] = dict(request=request, hit=False)
+
+ next_url = base_url
+ hops = [base_url]
+
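+        # redirect/landing-page loop: each iteration fetches one resource and may
+        # extract a single "next hop" URL (eg, a PDF link from an HTML landing
+        # page), bounded by self.max_hops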
+ while len(hops) <= self.max_hops:
+
+ result["hops"] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ # also check against known loginwall patterns
+ for block in self.wall_blocklist:
+ if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
+ result["status"] = "skip-wall"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ try:
+ resource = self.find_resource(
+ next_url, best_mimetype, force_recrawl=force_recrawl
+ )
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ assert resource
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ if ingest_type == "pdf" and html_ish_resource:
+
+ # the new style of URL extraction (already computed)
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+
+ result["extract_next_hop"] = fulltext_url
+ if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
+ result["status"] = "no-pdf-link"
+ return result
+ next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
+ assert next_url
+ next_url = clean_url(next_url)
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ fulltext_url.get("technique"),
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif (
+ ingest_type in ("xml", "html", "component")
+ and html_ish_resource
+ and html_biblio
+ ):
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ next_url = next_url_found
+ technique = "html_biblio"
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result["status"] = "max-hops-exceeded"
+ return result
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "src":
+ if file_meta["mimetype"] not in self.src_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta["mimetype"] not in self.component_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
+
+ info = self.process_file_hit(ingest_type, resource, file_meta)
+ result.update(info)
+
+ # check if processing turned up an error
+ if info.get("status") not in ("success", None):
+ result["status"] = info["status"]
+ return result
+
+ result["status"] = "success"
+ result["hit"] = True
+ if ingest_type == "pdf":
+ print(
+ "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ result.get("grobid", {}).get("status_code"),
+ result.get("pdf_meta", {}).get("status"),
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ ),
+ file=sys.stderr,
+ )
+ return result
+
+
+class IngestFileRequestHandler(BaseHTTPRequestHandler):
+ def do_POST(self) -> None:
+ if self.path != "/ingest":
+ self.send_response(404)
+ self.end_headers()
+ self.wfile.write(b"404: Not Found")
+ return
+ length = int(self.headers.get("content-length"))
+ request = json.loads(self.rfile.read(length).decode("utf-8"))
+ print("Got request: {}".format(request))
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(json.dumps(result).encode("utf8"))
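+
+
+# Hypothetical invocation of the handler above, assuming it has been wired into
+# an http.server.HTTPServer listening on localhost:8083 (the port is an
+# assumption, not something this module configures):
+#
+#   import requests
+#   resp = requests.post(
+#       "http://localhost:8083/ingest",
+#       json={"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"},
+#   )
+#   print(resp.json()["status"])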
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..3acbece
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,516 @@
+import json
+import sys
+import time
+from typing import Any, Dict, Optional
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.fileset_platforms import (
+ ArchiveOrgHelper,
+ DataverseHelper,
+ FigshareHelper,
+ ZenodoHelper,
+)
+from sandcrawler.fileset_strategies import (
+ ArchiveorgFilesetStrategy,
+ ArchiveorgFileStrategy,
+ WebFilesetStrategy,
+ WebFileStrategy,
+)
+from sandcrawler.fileset_types import (
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (
+ CdxApiError,
+ PetaboxError,
+ SavePageNowError,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_file import IngestFileWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.workers import SandcrawlerWorker
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFilesetWorker(IngestFileWorker):
+ """
+ General process is:
+
+ 1. crawl base_url, and use request and landing page resource (eg, HTML) to
+ determine platform being targeted
+ 2. use platform-specific helper to fetch metadata about the work, including
+ a manifest of files, and selection of an "ingest strategy" and any
+ required context
+ 3. then use strategy-specific helper to archive files from manifest (first
+ checking to see if content has been archived already)
+ 4. summarize status
+ """
+
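+    # A minimal usage sketch (illustrative; 'dataset' is the only ingest_type
+    # this worker accepts, and the record URL is just an assumed example):
+    #
+    #   worker = IngestFilesetWorker(try_spn2=False)
+    #   result = worker.process(
+    #       {"ingest_type": "dataset", "base_url": "https://zenodo.org/record/5678"}
+    #   )
+    #   print(result["status"], result.get("file_count"))
+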
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__(sink=None, **kwargs)
+
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.sink = sink
+ self.dataset_platform_helpers = {
+ "dataverse": DataverseHelper(),
+ "figshare": FigshareHelper(),
+ "zenodo": ZenodoHelper(),
+ "archiveorg": ArchiveOrgHelper(),
+ }
+ self.dataset_strategy_archivers = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+ IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+ IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
+ }
+
+ self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)
+ self.max_file_count = kwargs.get("max_file_count", 200)
+ self.ingest_file_result_sink = kwargs.get("ingest_file_result_sink")
+ self.ingest_file_result_stdout = kwargs.get("ingest_file_result_stdout", False)
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Same as file version, but uses fileset result table
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_fileset_platform(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest fileset result, do any database fetches
+ or additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+
+ def want(self, request: dict) -> bool:
+        if request.get("ingest_type") not in ("dataset",):
+ return False
+ return True
+
+ def fetch_resource_iteratively(
+ self, ingest_type: str, base_url: str, force_recrawl: bool
+ ) -> dict:
+ """
+ This is copypasta from process_file(), should probably refactor.
+ """
+
+ result: Dict[str, Any] = dict(hit=False)
+ result["hops"] = [base_url]
+ next_url = base_url
+
+ # check against blocklist
+ for block in self.base_url_blocklist:
+ # NOTE: hack to not skip archive.org content
+ if "archive.org" in block:
+ continue
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ try:
+ resource = self.find_resource(next_url, force_recrawl=force_recrawl)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ html_biblio = None
+ if resource:
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ # eg, datasets, components, etc
+ pass
+
+ result["_html_biblio"] = html_biblio
+ result["_resource"] = resource
+ return result
+
+ def process(self, request: dict, key: Any = None) -> dict:
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("dataset",):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ # TODO: "existing" check against file and/or fileset ingest result table
+ # existing = self.check_existing_ingest(ingest_type, base_url)
+ # if existing:
+ # return self.process_existing(request, existing)
+
+ result = self.fetch_resource_iteratively(
+ ingest_type, base_url, force_recrawl=force_recrawl
+ )
+ result["request"] = request
+ if result.get("status") is not None:
+ result["request"] = request
+ return result
+
+ html_biblio = result.pop("_html_biblio")
+ resource = result.pop("_resource")
+
+ # 1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ # TODO: could involve html_guess_platform() here?
+
+ # determine platform
+ platform_helper = None
+ for (helper_name, helper) in self.dataset_platform_helpers.items():
+ if helper.match_request(request, resource, html_biblio):
+ platform_helper = helper
+ break
+
+ if not platform_helper:
+ result["status"] = "no-platform-match"
+ return result
+
+ # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+ try:
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ except PlatformScopeError as e:
+ result["status"] = "platform-scope"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PlatformRestrictedError as e:
+ result["status"] = "platform-restricted"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except requests.exceptions.HTTPError as e:
+ result["error_message"] = str(e)[:1600]
+ if e.response.status_code == 404:
+ result["status"] = "platform-404"
+ result["error_message"] = str(e)[:1600]
+ return result
+ else:
+ result["status"] = "platform-http-error"
+ return result
+ except requests.exceptions.RequestException as e:
+ result["error_message"] = str(e)[:1600]
+ result["status"] = "platform-error"
+ return result
+
+ # print(dataset_meta, file=sys.stderr)
+ platform = dataset_meta.platform_name
+ result["platform_name"] = dataset_meta.platform_name
+ result["platform_domain"] = dataset_meta.platform_domain
+ result["platform_id"] = dataset_meta.platform_id
+ result["platform_base_url"] = dataset_meta.web_base_url
+ result["archiveorg_item_name"] = dataset_meta.archiveorg_item_name
+
+ if not dataset_meta.manifest:
+ result["status"] = "empty-manifest"
+ return result
+
+ # these will get confirmed/updated after ingest
+ result["manifest"] = [m.dict(exclude_none=True) for m in dataset_meta.manifest]
+ result["file_count"] = len(dataset_meta.manifest)
+ result["total_size"] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+ if result["total_size"] > self.max_total_size:
+ result["status"] = "too-large-size"
+ return result
+ if result["file_count"] > self.max_file_count:
+ # hard max, to prevent downstream breakage
+ if result["file_count"] > 10 * 1000:
+ result["manifest"] = result["manifest"][: self.max_file_count]
+ result["status"] = "too-many-files"
+ return result
+
+ ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+ result["ingest_strategy"] = ingest_strategy
+ print(
+ f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}",
+ file=sys.stderr,
+ )
+
+ strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+ if not strategy_helper:
+ result["status"] = "no-strategy-helper"
+ return result
+
+ # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ # 4. Summarize status and return structured result metadata.
+ result["status"] = archive_result.status
+ result["manifest"] = [m.dict(exclude_none=True) for m in archive_result.manifest]
+
+ if ingest_strategy.endswith("-fileset-bundle"):
+ result["fileset_bundle"] = dict()
+ if archive_result.bundle_file_meta:
+ result["fileset_bundle"]["file_meta"] = archive_result.bundle_file_meta
+ if archive_result.bundle_archiveorg_path:
+ result["fileset_bundle"][
+ "archiveorg_bundle_path"
+ ] = archive_result.bundle_archiveorg_path
+ if archive_result.bundle_resource:
+ result["fileset_bundle"]["terminal"] = dict(
+ terminal_url=archive_result.bundle_resource.terminal_url,
+ terminal_dt=archive_result.bundle_resource.terminal_dt,
+ terminal_status_code=archive_result.bundle_resource.terminal_status_code,
+ )
+ if archive_result.bundle_resource.cdx:
+ result["fileset_bundle"]["cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.cdx
+ )
+ if archive_result.bundle_resource.revisit_cdx:
+ result["fileset_bundle"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.revisit_cdx
+ )
+
+ if ingest_strategy.endswith("-file"):
+ result["fileset_file"] = dict()
+ if archive_result.file_file_meta:
+                result["fileset_file"]["file_meta"] = archive_result.file_file_meta
+ if archive_result.file_resource:
+ result["fileset_file"]["terminal"] = dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ )
+ if archive_result.file_resource.cdx:
+ result["fileset_file"]["cdx"] = cdx_to_dict(
+ archive_result.file_resource.cdx
+ )
+ if archive_result.file_resource.revisit_cdx:
+ result["fileset_file"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+
+ if result["status"].startswith("success"):
+ # check that these are still valid
+ assert result["file_count"] == len(archive_result.manifest)
+ assert result["total_size"] == sum(
+ [m.size for m in archive_result.manifest if m.size]
+ )
+
+ if (
+ result["status"] == "success-file"
+ and archive_result.file_resource
+ and archive_result.file_file_meta
+ ):
+ file_result: Dict[str, Any] = dict(
+ hit=True,
+ status="success",
+ request=request.copy(),
+ file_meta=archive_result.file_file_meta,
+ terminal=dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ terminal_sha1hex=archive_result.file_file_meta["sha1hex"],
+ ),
+ )
+ if archive_result.file_resource.cdx:
+ file_result["cdx"] = cdx_to_dict(archive_result.file_resource.cdx)
+ if archive_result.file_resource.revisit_cdx:
+ file_result["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+ file_result["request"]["ingest_type"] = request["ingest_type"] + "-file"
+            # call the super() (ingest_file) version of process_file_hit()
+ info = self.process_file_hit(
+ file_result["request"]["ingest_type"],
+ archive_result.file_resource,
+ archive_result.file_file_meta,
+ )
+ file_result.update(info)
+ if self.ingest_file_result_sink:
+ self.ingest_file_result_sink.push_record(result.copy())
+ elif self.ingest_file_result_stdout:
+ sys.stdout.write(json.dumps(file_result, sort_keys=True) + "\n")
+
+ if result["status"].startswith("success"):
+ result["hit"] = True
+ print(
+ "[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["status"],
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ return result
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/ingest_html.py
index 91b9cd6..fb42e71 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,27 +1,43 @@
-
-import io
-import sys
-import json
-import datetime
import argparse
+import datetime
+import json
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import (
+ BiblioMetadata,
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiClient,
+ NoCaptureError,
+ WaybackClient,
+ WaybackContentError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+ datetime_to_cdx,
+ gen_file_metadata,
+ parse_cdx_datetime,
+ url_fuzzy_equal,
+)
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
def html_extract_body_teixml(doc: bytes) -> dict:
try:
- tei_xml = trafilatura.extract(doc,
- tei_output=True,
+ tei_xml = trafilatura.extract(
+ doc,
+ output_format="xmltei",
include_comments=False,
include_formatting=True,
)
@@ -33,22 +49,28 @@ def html_extract_body_teixml(doc: bytes) -> dict:
if tei_xml:
body_txt = teixml_body_text(tei_xml)
word_count = len(body_txt.split())
- return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
- elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+ return dict(
+ status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+ )
+ elif doc.startswith(
+ b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'
+ ):
# hack for firstmonday.org
return html_extract_body_teixml(doc[106:])
else:
return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
+
def teixml_body_text(doc_xml: str) -> str:
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
tree = ET.fromstring(doc_xml)
- body = tree.find('.//tei:body', ns)
+ body = tree.find(".//tei:body", ns)
if body:
return " ".join(body.itertext())
else:
return ""
+
class WebResource(pydantic.BaseModel):
surt: str
timestamp: datetime.datetime
@@ -61,16 +83,15 @@ class WebResource(pydantic.BaseModel):
resource_type: Optional[str]
class Config:
- json_encoders = {
- datetime.datetime: lambda dt: dt.isoformat()
- }
+ json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}
+
class IngestWebResult(pydantic.BaseModel):
status: str
hit: bool
error_message: Optional[str]
cdx: Optional[dict]
- terminal: Optional[Any] # TODO
+ terminal: Optional[Any] # TODO
request: Optional[Any] # TODO
file_meta: Optional[dict]
html_biblio: Optional[BiblioMetadata]
@@ -84,6 +105,7 @@ class IngestWebResult(pydantic.BaseModel):
datetime.datetime: lambda dt: dt.isoformat(),
}
+
class HtmlMetaRow(pydantic.BaseModel):
sha1hex: str
status: str
@@ -106,7 +128,7 @@ class HtmlMetaRow(pydantic.BaseModel):
"""
return (
self.sha1hex,
- datetime.datetime.now(), # updated
+ datetime.datetime.now(), # updated
self.status,
self.scope,
self.has_teixml,
@@ -117,7 +139,9 @@ class HtmlMetaRow(pydantic.BaseModel):
)
-def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def quick_fetch_html_resources(
+ resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the lazy version that just does a CDX lookup for each resource.
@@ -128,31 +152,37 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
+ cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
if not cdx_row:
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
- print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+ if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+ print(
+ f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+ )
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
- print(f" WARN: skipping revisit record", file=sys.stderr)
+ print(" WARN: skipping revisit record", file=sys.stderr)
continue
- full.append(WebResource(
- surt=cdx_row.surt,
- timestamp=cdx_row.datetime,
- url=cdx_row.url,
- sha1hex=cdx_row.sha1hex,
- mimetype=cdx_row.mimetype,
- status_code=cdx_row.status_code,
- size=None,
- sha256hex=None,
- resource_type=resource['type'],
- ))
+ full.append(
+ WebResource(
+ surt=cdx_row.surt,
+ timestamp=cdx_row.datetime,
+ url=cdx_row.url,
+ sha1hex=cdx_row.sha1hex,
+ mimetype=cdx_row.mimetype,
+ status_code=cdx_row.status_code,
+ size=None,
+ sha256hex=None,
+ resource_type=resource["type"],
+ )
+ )
return full
-def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def fetch_html_resources(
+ resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the full version which fetches each resource from wayback/petabox
and calculates additional hashes.
@@ -163,33 +193,50 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
- if not wayback_resp or wayback_resp.status != 'success':
+ wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+ if not wayback_resp or wayback_resp.status != "success":
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
- raise WaybackContentError("wayback payload sha1hex mismatch: {wayback_resp.cdx.url}")
- full.append(WebResource(
- surt=wayback_resp.cdx.surt,
- timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
- url=wayback_resp.cdx.url,
- sha1hex=file_meta['sha1hex'],
- mimetype=file_meta['mimetype'],
- status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code,
- size=file_meta['size_bytes'],
- sha256hex=file_meta['sha256hex'],
- resource_type=resource['type'],
- ))
+        # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+        # either the transfer-encoded or the inner (un-encoded) payload body.
+        # This is because of an ambiguity in the WARC specification
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
+ raise WaybackContentError(
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
+ )
+ full.append(
+ WebResource(
+ surt=wayback_resp.cdx.surt,
+ timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
+ url=wayback_resp.cdx.url,
+ sha1hex=file_meta["sha1hex"],
+ mimetype=file_meta["mimetype"],
+ status_code=wayback_resp.cdx.status_code
+ or wayback_resp.revisit_cdx.status_code,
+ size=file_meta["size_bytes"],
+ sha256hex=file_meta["sha256hex"],
+ resource_type=resource["type"],
+ )
+ )
return full
-def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+def html_guess_platform(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
generator: Optional[str] = None
generator_elem = doc.css_first("meta[name='generator']")
if generator_elem:
- generator = generator_elem.attrs['content']
+ generator = generator_elem.attrs["content"]
else:
generator_elem = doc.css_first("a[id='developedBy']")
if generator_elem:
@@ -200,12 +247,21 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
return "ojs"
elif generator and "plone" in generator.lower():
return "plone"
+ elif generator and "wordpress" in generator.lower():
+ return "wordpress"
+ elif generator and "blogger" in generator.lower():
+ return "blogger"
elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
return "ojs"
else:
try:
- if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+ if (
+ 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+ in doc.html
+ ):
return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
return "arpha"
if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
@@ -214,18 +270,21 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
pass
icon_elem = doc.css_first("link[type='image/x-icon']")
- if icon_elem and 'href' in icon_elem.attrs:
- if 'journalssystem.com' in icon_elem.attrs['href']:
+ if icon_elem and "href" in icon_elem.attrs:
+ if "journalssystem.com" in icon_elem.attrs["href"]:
return "journalssystem.com"
- elif 'indexcopernicus.com' in icon_elem.attrs['href']:
+ elif "indexcopernicus.com" in icon_elem.attrs["href"]:
return "indexcopernicus"
- if 'scielo' in url:
+ if "scielo" in url:
return "scielo"
return None
-def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
+
+def html_guess_scope(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
"""
This function tries to guess if an HTML document represents one of:
@@ -236,6 +295,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
- component
- issue-fulltext
- landingpage
+ - homepage-domain
- blocked-paywall
- blocked-login
- blocked-captcha
@@ -249,6 +309,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
fulltext or a landing page, but could be one of the other categories.
"""
+ # assert that this is a real URL
+ assert url.count("/") >= 2
+
# basic paywall and loginwall detection based on URL
if url.endswith("/cookieAbsent"):
return "blocked-cookie"
@@ -264,6 +327,10 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if "showcaptcha.asp" in url:
return "blocked-captcha"
+ # is this the top-level URL of the domain? aka, no path?
+    if url.count("/") <= 2 or (url.count("/") == 3 and url.endswith("/")):
+ return "homepage-domain"
+
platform = html_guess_platform(url, doc, biblio)
if biblio:
@@ -308,13 +375,17 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if word_count is not None:
if word_count < 20:
return "stub"
+ elif word_count > 500 and platform in ["wordpress", "blogger"]:
+ return "article-fulltext"
elif word_count > 1200:
return "article-fulltext"
return "unknown"
-def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
+def run_single(
+ url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
adblock = load_adblock_rules()
wayback_client = WaybackClient()
@@ -332,7 +403,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
file_meta = gen_file_metadata(html_resource.body)
file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
- if file_meta['mimetype'] not in ("text/html", "text/xml"):
+ if file_meta["mimetype"] not in ("text/html", "text/xml"):
return IngestWebResult(
status="wrong-mimetype",
hit=False,
@@ -343,8 +414,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
html_doc = HTMLParser(html_resource.body)
html_biblio = html_extract_biblio(url, html_doc)
html_body = html_extract_body_teixml(html_resource.body)
- html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
- if html_scope not in ('article-fulltext', 'unknown'):
+ html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+ if html_scope not in ("article-fulltext", "unknown"):
return IngestWebResult(
status="wrong-scope",
hit=False,
@@ -361,7 +432,9 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
full_resources: List[WebResource] = []
if quick_mode:
- full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, wayback_client.cdx_client, when
+ )
else:
full_resources = fetch_html_resources(raw_resources, wayback_client, when)
@@ -382,12 +455,10 @@ def main() -> None:
"""
Run this command like:
- python -m sandcrawler.html_ingest
+ python -m sandcrawler.ingest_html
"""
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
subparsers = parser.add_subparsers()
sub = subparsers.add_parser(
@@ -419,9 +490,10 @@ def main() -> None:
result = run_single(args.url, args.timestamp, args.quick_mode)
print(result.json(indent=2, exclude_none=True))
else:
- #func = getattr(wp, args.func)
- #func()
+ # func = getattr(wp, args.func)
+ # func()
raise NotImplementedError()
+
if __name__ == "__main__":
main()
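For reference, a hedged sketch of calling the renamed module's run_single() helper directly instead of via the CLI; the URL and timestamp below are illustrative placeholders, not values from the patch.

from sandcrawler.ingest_html import run_single

# quick_mode=True resolves sub-resources with CDX lookups only, skipping full
# wayback fetches and hash verification
result = run_single(
    "https://example.com/article/123",  # placeholder URL
    timestamp="20201028235103",
    quick_mode=True,
)
print(result.json(indent=2, exclude_none=True))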
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..8836515 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,14 +1,18 @@
-
-import io
-import os
import hashlib
+import io
+from typing import Optional, Tuple, Union
import minio
class SandcrawlerMinioClient(object):
-
- def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None,
+ ):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -28,7 +32,7 @@ class SandcrawlerMinioClient(object):
)
self.default_bucket = default_bucket
- def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
if not extension:
extension = ""
if not prefix:
@@ -44,7 +48,15 @@ class SandcrawlerMinioClient(object):
)
return obj_path
- def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def put_blob(
+ self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> Tuple[str, str]:
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -53,7 +65,7 @@ class SandcrawlerMinioClient(object):
filename is SHA1 with an optional file extension.
"""
if type(blob) == str:
- blob = blob.encode('utf-8')
+ blob = blob.encode("utf-8")
assert type(blob) == bytes
if not sha1hex:
h = hashlib.sha1()
@@ -64,13 +76,13 @@ class SandcrawlerMinioClient(object):
bucket = self.default_bucket
assert bucket
content_type = "application/octet-stream"
- if extension.endswith('.xml'):
+ if extension.endswith(".xml"):
content_type = "application/xml"
- if extension.endswith('.png'):
+ if extension.endswith(".png"):
content_type = "image/png"
- elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ elif extension.endswith(".jpg") or extension.endswith(".jpeg"):
content_type = "image/jpeg"
- elif extension.endswith('.txt'):
+ elif extension.endswith(".txt"):
content_type = "text/plain"
self.mc.put_object(
bucket,
@@ -81,7 +93,14 @@ class SandcrawlerMinioClient(object):
)
return (bucket, obj_path)
- def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ def get_blob(
+ self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> bytes:
"""
sha1hex is sha1 of the blob itself
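A usage sketch for the now type-annotated client; the connection string, credentials, and bucket are placeholders, and get_blob() is assumed to fall back to the default bucket the same way put_blob() does.

import hashlib

from sandcrawler.minio import SandcrawlerMinioClient

client = SandcrawlerMinioClient(
    host_url="localhost:9000",     # placeholder host:port connection string
    access_key="minioadmin",       # placeholder credentials
    secret_key="minioadmin",
    default_bucket="sandcrawler",  # placeholder bucket
)

blob = b"<TEI>...</TEI>"
sha1hex = hashlib.sha1(blob).hexdigest()

# put_blob() would hash the bytes itself if sha1hex were omitted
bucket, obj_path = client.put_blob("grobid", blob, sha1hex=sha1hex, extension=".tei.xml")

# blobs are addressed by folder + SHA-1 (+ extension)
round_trip = client.get_blob("grobid", sha1hex, extension=".tei.xml")
assert round_trip == blob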
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index a3e2960..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,39 +1,50 @@
-
import base64
-import magic
-import hashlib
import datetime
-from typing import Optional
+import hashlib
+import os
+from typing import List, Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
- parsed.colon_before_port = b''
+ parsed.colon_before_port = b""
return str(urlcanon.whatwg(parsed))
+
def url_fuzzy_equal(left: str, right: str) -> bool:
"""
TODO: use proper surt library and canonicalization for this check
"""
- fuzzy_left = '://'.join(clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:])
- fuzzy_right = '://'.join(clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:])
+ fuzzy_left = "://".join(
+ clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ fuzzy_right = "://".join(
+ clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
if fuzzy_left == fuzzy_right:
return True
elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
return True
return False
+
def test_url_fuzzy_equal() -> None:
- assert True == url_fuzzy_equal(
- "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
- "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree")
+ assert (
+ url_fuzzy_equal(
+ "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ )
+ is True
+ )
+
def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
"""
@@ -44,12 +55,15 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
assert blob is not None
if not allow_empty:
assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ if len(blob) < 1024 * 1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
if mimetype in ("application/xml", "text/xml"):
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
mimetype = "application/xhtml+xml"
- elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
@@ -66,6 +80,49 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
mimetype=mimetype,
)
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+ """
+ Variant of gen_file_metadata() which works with files on local disk
+ """
+ assert path is not None
+ mimetype = magic.Magic(mime=True).from_file(path)
+ if mimetype in ("application/xml", "text/xml"):
+ with open(path, "rb") as f:
+ blob = f.read(1024)
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if (
+ b"<htm" in blob[:1024]
+ and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]
+ ):
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
+ hashes = [
+ hashlib.sha1(),
+ hashlib.sha256(),
+ hashlib.md5(),
+ ]
+ size_bytes = 0
+ with open(path, "rb") as f:
+ while True:
+ chunk = f.read(1024 * 1024)
+ if not chunk:
+ break
+ size_bytes += len(chunk)
+ for h in hashes:
+ h.update(chunk)
+ if not allow_empty:
+ assert size_bytes > 0
+ return dict(
+ size_bytes=size_bytes,
+ sha1hex=hashes[0].hexdigest(),
+ sha256hex=hashes[1].hexdigest(),
+ md5hex=hashes[2].hexdigest(),
+ mimetype=mimetype,
+ )
+
+
def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -79,16 +136,18 @@ def b32_hex(s: str) -> str:
if len(s) == 40:
return s
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
NORMAL_MIME = (
- 'application/pdf',
- 'application/postscript',
- 'text/html',
- 'text/xml',
- 'application/octet-stream',
+ "application/pdf",
+ "application/postscript",
+ "text/html",
+ "text/xml",
+ "application/octet-stream",
)
+
def normalize_mime(raw: str) -> Optional[str]:
raw = raw.lower().strip()
for norm in NORMAL_MIME:
@@ -96,28 +155,26 @@ def normalize_mime(raw: str) -> Optional[str]:
return norm
# Special cases
- if raw.startswith('application/xml'):
- return 'text/xml'
- if raw.startswith('application/x-pdf'):
- return 'application/pdf'
+ if raw.startswith("application/xml"):
+ return "text/xml"
+ if raw.startswith("application/x-pdf"):
+ return "application/pdf"
+ if raw in (".pdf",):
+ return "application/pdf"
if raw in (
- '.pdf',
- ):
- return 'application/pdf'
- if raw in (
- 'application/download',
- 'binary/octet-stream',
- 'unk',
- 'application/x-download',
- 'application/octetstream',
- 'application/force-download',
- 'application/unknown',
- ):
- return 'application/octet-stream'
+ "application/download",
+ "binary/octet-stream",
+ "unk",
+ "application/x-download",
+ "application/octetstream",
+ "application/force-download",
+ "application/unknown",
+ ):
+ return "application/octet-stream"
return None
-def test_normalize_mime():
+def test_normalize_mime() -> None:
assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -130,7 +187,7 @@ def test_normalize_mime():
assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -151,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
offset = cdx[9]
warc = cdx[10]
- if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
- and len(sha1b32) == 32 and dt.isdigit()):
+ if not (
+ sha1b32.isalnum()
+ and c_size.isdigit()
+ and offset.isdigit()
+ and len(sha1b32) == 32
+ and dt.isdigit()
+ ):
return None
- if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+ if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
return None
- if mime is None or mime == '-':
+ if mime is None or mime == "-":
mime = "application/octet-stream"
if normalize:
@@ -179,6 +241,7 @@ def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
warc_path=warc,
)
+
def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
if not dt_str:
return None
@@ -187,23 +250,39 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
except Exception:
return None
+
def test_parse_cdx_datetime() -> None:
- assert parse_cdx_datetime("") == None
- assert parse_cdx_datetime("asdf") == None
- assert parse_cdx_datetime("19930203123045") != None
- assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
+ assert parse_cdx_datetime("19930203123045") is not None
+ assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+ year=2020, month=10, day=28, hour=23, minute=51, second=3
+ )
+
def datetime_to_cdx(dt: datetime.datetime) -> str:
- return '%04d%02d%02d%02d%02d%02d' % (
- dt.year, dt.month, dt.day,
- dt.hour, dt.minute, dt.second,
+ return "%04d%02d%02d%02d%02d%02d" % (
+ dt.year,
+ dt.month,
+ dt.day,
+ dt.hour,
+ dt.minute,
+ dt.second,
)
+
def test_datetime_to_cdx() -> None:
- assert "20201028235103" == datetime_to_cdx(datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
+ assert "20201028235103" == datetime_to_cdx(
+ datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ )
-def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None) -> requests.Session:
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 1,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: Optional[requests.Session] = None,
+) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
@@ -216,7 +295,23 @@ def requests_retry_session(retries=10, backoff_factor=3,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
return session
+
+def sanitize_fs_path(path: str) -> str:
+ """
+ From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+ """
+ # - pretending to chroot to the current directory
+ # - cancelling all redundant paths (/.. = /)
+ # - making the path relative
+ return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+
+def test_sanitize_fs_path() -> None:
+ assert sanitize_fs_path("/thing.png") == "thing.png"
+ assert sanitize_fs_path("../../thing.png") == "thing.png"
+ assert sanitize_fs_path("thing.png") == "thing.png"
+ assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 311bbf8..97d338e 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,22 +1,21 @@
-
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
# these, maybe due to threading.
-BAD_PDF_SHA1HEX = [
+BAD_PDF_SHA1HEX: List[str] = [
"011478a1e63a2a31eae1a93832a74cc95f220760",
"018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
"057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
@@ -57,6 +56,7 @@ BAD_PDF_SHA1HEX = [
"43a8c0abf0386d3e3397cf5e22a884761dd63db7",
"445968ef735b228c08c3ff4238d99fc9f4824619",
"447fa6b5a90742a86429a932f6608d8e141688c0",
+ "45f014d7d631559dc7726e5c5513f1e7c91c48a9",
"47577ff6d6876117ca69bec60a5764f7d2c2ec70",
"4785181cec8944eee00ddb631a5dfc771b89bab7",
"47db2db2cc976429568841a0496c0ab4ed7b5977",
@@ -65,37 +65,49 @@ BAD_PDF_SHA1HEX = [
"4edc1402712fa6827c4501fed8042e9f4447829c",
"50b3c5a3122272aca69855ef06b85d0b43a76eb1",
"52fc9b3c5199ef395d410c7cee5961dc812e4d29",
+ "53471346019947a88c1ba141fb829375527153b0",
"58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
"59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
"5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
"5e04779cbbae5ce88bb786064f756885dd6895fe",
"5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
"623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
"646c4a654270606256397684204ff0f3d17be2e7",
"64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
"689b5cb3ddef213d612363a903f10d0358ea64d2",
"6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
"74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
"75c2662a96ccc48891228df7c85eb7d4da9dd621",
"771f1ca0007a6fbed5b4a434c73f524f715d33c1",
"776859635e9dc01d97b0582f49c814ffbcb019fb",
"781dafda896a9f5c30f3d0a011f79a3b79b574c4",
"788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
"79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
+ "7cfc0739be9c49d94272110a0a748256bdde9be6",
"7daf61526ec825151f384cc1db510ca5237d5d80",
"7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
"8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
"859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
"88edcbab1cac2d70af5870422974afc253f4f0c6",
"89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
"8dcaf4ef132900dd378f7be526c884b17452713b",
"8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
"949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
"961ec451172f373f919c593737466300e42062cb",
"976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
"98b02eb70066c182c705ef4d14d8b723ad7f1fab",
"993ca31f6974f8387bb18dd7d38987d290da8781",
"9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
"a2298c137b9c8c8975bad62eea9224edb95e6952",
"a2671738755ab8b24775e95375dc72f1ca4e5fd6",
"a26f299fb97c646effeebd4c5e2968786bd0f781",
@@ -104,7 +116,9 @@ BAD_PDF_SHA1HEX = [
"a69665d0b5d3b95f54f68406eee3ed50c67efb45",
"a8357c31837404f9ebd798999d546c9398ab3648",
"a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
"ad038725bf6855a79f3c768ebe93c7103d14522f",
+ "aef581bf42e76e527f5aed3b8958fd4e7a24819f",
"b2b66b9c7f817a20144456f99c0be805602e8597",
"b2d719120306b90eb8dd3580b699a61ec70556f4",
"b4b8e18e27f102e59b2be2d58c7b54d0a0eb457a",
@@ -113,9 +127,11 @@ BAD_PDF_SHA1HEX = [
"b8b427e5b3d650ba9e03197f9c3917e25b878930",
"bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
"be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
"c1b583fbd052572f08158d39ffe4d7510dadbebb",
"c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
"c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
"c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
"c7687fa6f637c7d32a25be0e772867d87536d35c",
"c7d8b37ec99cf0d987e60667f05299f200e18a5d",
@@ -126,101 +142,117 @@ BAD_PDF_SHA1HEX = [
"d055c054c330f99ec011e37186d2b429339758fd",
"d17b1e254cce82df5c6eb4fd492cef91e7e11558",
"d188762a7e3ab5d4ee8a897204316513e4e636ec",
+ "d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
"d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
"da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
"e01bb7256d77aea258313bb410dfcfc10512f420",
"e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
"e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
"e64714a81f60ab9286ec90cad682cb22e564fb6f",
"e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
+ "e9e84e17383e93a784a8471708619162b32fb399",
"eac7df5f799983d5a7cc55d10b4d426dc557febf",
+ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
"eb1b39fd7a874896688855a22efddef10272427c",
"eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
"edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
+ "ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
"ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
+ "ef1dfa325c21cff4cd8bb1a9b6c4ee6996d43c8f",
"ef6749d9263a01f921ba7d72df0d17671d14e5f6",
"f0ea221d8587cede25592266486e119d277f7096",
"f68f9a9202a75d2aee35252e104d796f9515001e",
"f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
"fd9bd560662e070b222d63052830837829c490f0",
]
+
@dataclass
class PdfExtractResult:
sha1hex: str
status: str
error_msg: Optional[str] = None
- file_meta: Optional[Dict[str,Any]] = None
+ file_meta: Optional[Dict[str, Any]] = None
text: Optional[str] = None
page0_thumbnail: Optional[bytes] = None
has_page0_thumbnail: bool = False
meta_xml: Optional[str] = None
- pdf_info: Optional[Dict[str,Any]] = None
- pdf_extra: Optional[Dict[str,Any]] = None
- source: Optional[Dict[str,Any]] = None
+ pdf_info: Optional[Dict[str, Any]] = None
+ pdf_extra: Optional[Dict[str, Any]] = None
+ source: Optional[Dict[str, Any]] = None
def to_pdftext_dict(self) -> dict:
"""
         Outputs a dict of the form published (as JSON) to the Kafka text/info topic.
"""
return {
- 'key': self.sha1hex,
- 'sha1hex': self.sha1hex,
- 'status': self.status,
- 'file_meta': self.file_meta,
- 'error_msg': self.error_msg,
- 'text': self.text,
- 'has_page0_thumbnail': self.has_page0_thumbnail,
- 'meta_xml': self.meta_xml,
- 'pdf_info': self.pdf_info,
- 'pdf_extra': self.pdf_extra,
- 'source': self.source,
+ "key": self.sha1hex,
+ "sha1hex": self.sha1hex,
+ "status": self.status,
+ "file_meta": self.file_meta,
+ "error_msg": self.error_msg,
+ "text": self.text,
+ "has_page0_thumbnail": self.has_page0_thumbnail,
+ "meta_xml": self.meta_xml,
+ "pdf_info": self.pdf_info,
+ "pdf_extra": self.pdf_extra,
+ "source": self.source,
}
- @classmethod
- def from_pdftext_dict(cls, record):
+ @staticmethod
+ def from_pdftext_dict(record: Dict[str, Any]) -> "PdfExtractResult":
"""
         Parses a dict of the form published (as JSON) to the Kafka text/info topic.
"""
- if record['status'] != 'success':
+ if record["status"] != "success":
return PdfExtractResult(
- sha1hex=record.get('sha1hex') or record['key'],
- status=record['status'],
- error_msg=record.get('error_msg'),
+ sha1hex=record.get("sha1hex") or record["key"],
+ status=record["status"],
+ error_msg=record.get("error_msg"),
)
else:
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- file_meta=record.get('file_meta'),
- text=record.get('text'),
- has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
- meta_xml=record.get('meta_xml'),
- pdf_info=record.get('pdf_info'),
- pdf_extra=record.get('pdf_extra'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ file_meta=record.get("file_meta"),
+ text=record.get("text"),
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ meta_xml=record.get("meta_xml"),
+ pdf_info=record.get("pdf_info"),
+ pdf_extra=record.get("pdf_extra"),
)
- @classmethod
- def from_pdf_meta_dict(cls, record):
+ @staticmethod
+ def from_pdf_meta_dict(record: Dict[str, Any]) -> "PdfExtractResult":
"""
Parses what would be returned from postgrest
"""
- if record['status'] != 'success':
+ if record["status"] != "success":
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- error_msg=(record.get('metadata') or {}).get('error_msg'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ error_msg=(record.get("metadata") or {}).get("error_msg"),
)
else:
pdf_extra = dict()
- for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', 'pdf_version'):
+ for k in (
+ "page_count",
+ "page0_height",
+ "page0_width",
+ "permanent_id",
+ "pdf_version",
+ ):
if record.get(k):
pdf_extra[k] = record[k]
return PdfExtractResult(
- sha1hex=record['sha1hex'],
- status=record['status'],
- has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)),
- pdf_info=record.get('metadata'),
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ pdf_info=record.get("metadata"),
pdf_extra=pdf_extra,
)
@@ -237,31 +269,33 @@ class PdfExtractResult:
# TODO: form, encrypted
if self.pdf_info:
metadata = dict()
- for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'):
+ for k in ("Title", "Subject", "Author", "Creator", "Producer", "doi"):
if k in self.pdf_info:
metadata[k.lower()] = self.pdf_info[k]
- if 'CreationDate' in self.pdf_info:
- pdf_created = self.pdf_info['CreationDate']
+ if "CreationDate" in self.pdf_info:
+ pdf_created = self.pdf_info["CreationDate"]
metadata_json: Optional[str] = None
if metadata:
metadata_json = json.dumps(metadata, sort_keys=True)
return (
self.sha1hex,
- datetime.datetime.now(), # updated
+ datetime.datetime.now(), # updated
self.status,
self.has_page0_thumbnail,
- pdf_extra.get('page_count'),
+ pdf_extra.get("page_count"),
word_count,
- pdf_extra.get('page0_height'),
- pdf_extra.get('page0_width'),
- pdf_extra.get('permanent_id'),
+ pdf_extra.get("page0_height"),
+ pdf_extra.get("page0_width"),
+ pdf_extra.get("permanent_id"),
pdf_created,
- pdf_extra.get('pdf_version'),
+ pdf_extra.get("pdf_version"),
metadata_json,
)
-def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtractResult:
+def process_pdf(
+ blob: bytes, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG"
+) -> PdfExtractResult:
"""
A known issue is that output text is in "physical layout" mode, which means
columns will be side-by-side. We would prefer a single stream of tokens!
@@ -271,11 +305,11 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
didn't seem to work at all (returned empty strings).
"""
file_meta = gen_file_metadata(blob)
- sha1hex = file_meta['sha1hex']
- if file_meta['mimetype'] != 'application/pdf':
+ sha1hex = file_meta["sha1hex"]
+ if file_meta["mimetype"] != "application/pdf":
return PdfExtractResult(
sha1hex=sha1hex,
- status='not-pdf',
+ status="not-pdf",
error_msg=f"mimetype is '{file_meta['mimetype']}'",
file_meta=file_meta,
)
@@ -283,8 +317,8 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if sha1hex in BAD_PDF_SHA1HEX:
return PdfExtractResult(
sha1hex=sha1hex,
- status='bad-pdf',
- error_msg=f"PDF known to cause processing issues",
+ status="bad-pdf",
+ error_msg="PDF known to cause processing issues",
file_meta=file_meta,
)
@@ -294,7 +328,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if pdf is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-pdf',
+ status="empty-pdf",
file_meta=file_meta,
has_page0_thumbnail=False,
)
@@ -302,17 +336,18 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
if page0 is None:
return PdfExtractResult(
sha1hex=sha1hex,
- status='empty-page0',
+ status="empty-page0",
file_meta=file_meta,
)
         # this call sometimes fails and returns an AttributeError
page0rect = page0.page_rect()
- except (AttributeError, poppler.document.LockedDocumentError) as e:
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
# may need to expand the set of exceptions caught here over time, but
# starting with a narrow set
return PdfExtractResult(
sha1hex=sha1hex,
- status='parse-error',
+ status="parse-error",
error_msg=str(e),
file_meta=file_meta,
)
@@ -322,7 +357,9 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
renderer = poppler.PageRenderer()
try:
full_img = renderer.render_page(page0)
- img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', "BGRA", 0, 1)
+ img = Image.frombuffer(
+ "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1
+ )
img.thumbnail(thumb_size, Image.BICUBIC)
buf = BytesIO()
img.save(buf, thumb_type)
@@ -342,23 +379,23 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
except AttributeError as e:
return PdfExtractResult(
sha1hex=sha1hex,
- status='parse-error',
+ status="parse-error",
error_msg=str(e),
file_meta=file_meta,
)
# Kafka message size limit; cap at about 1 MByte
- if len(full_text)> 1000000:
+ if len(full_text) > 1000000:
return PdfExtractResult(
sha1hex=sha1hex,
- status='text-too-large',
+ status="text-too-large",
error_msg="full_text chars: {}".format(len(full_text)),
file_meta=file_meta,
)
- if len(pdf.metadata)> 1000000:
+ if len(pdf.metadata) > 1000000:
return PdfExtractResult(
sha1hex=sha1hex,
- status='text-too-large',
+ status="text-too-large",
error_msg="meta_xml chars: {}".format(len(full_text)),
file_meta=file_meta,
)
@@ -368,7 +405,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
except UnicodeDecodeError:
return PdfExtractResult(
sha1hex=sha1hex,
- status='bad-unicode',
+ status="bad-unicode",
error_msg="in infos()",
file_meta=file_meta,
)
@@ -389,7 +426,7 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
return PdfExtractResult(
sha1hex=sha1hex,
file_meta=file_meta,
- status='success',
+ status="success",
error_msg=None,
text=full_text or None,
has_page0_thumbnail=page0_thumbnail is not None,
@@ -406,16 +443,21 @@ def process_pdf(blob: bytes, thumb_size=(180,300), thumb_type="JPEG") -> PdfExtr
),
)
-class PdfExtractWorker(SandcrawlerFetchWorker):
- def __init__(self, wayback_client=None, sink=None, **kwargs):
+class PdfExtractWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
super().__init__(wayback_client=wayback_client)
self.wayback_client = wayback_client
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def timeout_response(self, task) -> Dict:
- default_key = task['sha1hex']
+ def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal pdf-extract worker timeout",
@@ -423,13 +465,12 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
sha1hex=default_key,
)
- def process(self, record, key: Optional[str] = None):
- default_key = record['sha1hex']
-
+ def process(self, record: Any, key: Optional[str] = None) -> dict:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = process_pdf(blob)
result.source = record
@@ -437,18 +478,19 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
+
class PdfExtractBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfExtractWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
- self.thumbnail_sink = kwargs.get('thumbnail_sink')
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
- def process(self, blob, key: Optional[str] = None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
assert isinstance(blob, bytes)
@@ -458,4 +500,3 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
return result.to_pdftext_dict()
-
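A hedged sketch of driving process_pdf() on a local file; the path is a placeholder, and python-poppler plus Pillow need to be installed for this to run.

import sys

from sandcrawler.pdfextract import process_pdf

with open("paper.pdf", "rb") as f:  # placeholder path
    blob = f.read()

result = process_pdf(blob)  # returns a PdfExtractResult dataclass
if result.status == "success":
    print(f"{result.sha1hex}: {len(result.text or '')} chars of text", file=sys.stderr)
    if result.page0_thumbnail:
        # thumbnails are JPEG by default, bounded to 180x300
        with open("page0_thumb.jpg", "wb") as out:
            out.write(result.page0_thumbnail)
else:
    print(f"extraction failed: {result.status} {result.error_msg}", file=sys.stderr)

# to_pdftext_dict() is the shape published to the Kafka text/info topics
record = result.to_pdftext_dict()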
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..112df6a 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,18 +1,19 @@
-
import time
+from typing import Any, Dict, Optional
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
-
- def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- def classify_pdf(self, blob, mode="auto"):
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
"""
Returns a dict with at least:
@@ -25,45 +26,43 @@ class PdfTrioClient(object):
appropriately; an optional `error_msg` may also be set. For some other
errors, like connection failure, an exception is raised.
"""
- assert blob
+ assert blob and type(blob) == bytes
try:
- pdftrio_response = requests.post(
+ pdftrio_response = self.http_session.post(
self.host_url + "/classify/research-pub/" + mode,
files={
- 'pdf_content': blob,
+ "pdf_content": blob,
},
timeout=60.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'pdftrio request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "pdftrio request (HTTP POST) timeout",
}
except requests.exceptions.ConnectionError:
# crude back-off
time.sleep(2.0)
return {
- 'status': 'error-connect',
- 'status_code': -2, # heritrix3 "HTTP connect" code
- 'error_msg': 'pdftrio request connection timout',
+ "status": "error-connect",
+ "status_code": -2, # heritrix3 "HTTP connect" code
+ "error_msg": "pdftrio request connection timeout",
}
- info = dict(
- status_code=pdftrio_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
if pdftrio_response.status_code == 200:
resp_json = pdftrio_response.json()
- assert 'ensemble_score' in resp_json
- assert 'status' in resp_json
- assert 'versions' in resp_json
+ assert "ensemble_score" in resp_json
+ assert "status" in resp_json
+ assert "versions" in resp_json
info.update(resp_json)
else:
- info['status'] = 'error'
+ info["status"] = "error"
# TODO: might return JSON with some info?
- info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+ info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
return info
@@ -72,59 +71,72 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
This class is basically copied directly from GrobidWorker
"""
- def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
- super().__init__(wayback_client=wayback_client)
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
+ super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
- default_key = record['sha1hex']
fetch_sec = None
start = time.time()
fetch_result = self.fetch_blob(record)
fetch_sec = time.time() - start
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
- result['source'] = record
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+ result["source"] = record
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
if fetch_sec:
- result['timing']['fetch_sec'] = fetch_sec
+ result["timing"]["fetch_sec"] = fetch_sec
return result
+
class PdfTrioBlobWorker(SandcrawlerWorker):
"""
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
- super().__init__()
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs
+ ):
+ super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
self.mode = mode
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
start_process = time.time()
if not blob:
return None
+ assert isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
return result
-
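A usage sketch for the client after the retry-session change above; the host URL is simply the constructor default and the PDF path is a placeholder.

from sandcrawler.pdftrio import PdfTrioClient

client = PdfTrioClient(host_url="http://pdftrio.qa.fatcat.wiki")

with open("sample.pdf", "rb") as f:  # placeholder path
    blob = f.read()

info = client.classify_pdf(blob, mode="auto")
if info["status_code"] == 200:
    # successful responses carry an ensemble_score plus per-model versions
    print("ensemble_score:", info["ensemble_score"])
else:
    print("classification failed:", info.get("status"), info.get("error_msg"))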
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 0fd54a4..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -1,4 +1,3 @@
-
"""
cdx
- read raw CDX, filter
@@ -20,106 +19,112 @@ grobid
"""
import os
-from typing import Optional, AnyStr
+import time
import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
+
+import psycopg2
+import requests
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
+from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.pdfextract import PdfExtractResult
-from sandcrawler.html_ingest import HtmlMetaRow
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter to full CDX lines, no liveweb
- cdx_batch = [r for r in batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
resp = self.db.insert_cdx(self.cur, cdx_batch)
if len(cdx_batch) < len(batch):
- self.counts['skip'] += len(batch) - len(cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["skip"] += len(batch) - len(cdx_batch)
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
self.db.commit()
return []
-class PersistIngestFileResultWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFileResultWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def request_to_row(self, raw):
+ def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema
if there is a problem with conversion, return None
"""
# backwards compat hacks; transform request to look like current schema
- if raw.get('ingest_type') == 'file':
- raw['ingest_type'] = 'pdf'
- if (not raw.get('link_source')
- and raw.get('base_url')
- and raw.get('ext_ids', {}).get('doi')
- and raw['base_url'] == "https://doi.org/{}".format(raw['ext_ids']['doi'])):
+ if raw.get("ingest_type") == "file":
+ raw["ingest_type"] = "pdf"
+ if (
+ not raw.get("link_source")
+ and raw.get("base_url")
+ and raw.get("ext_ids", {}).get("doi")
+ and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"])
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'doi'
- raw['link_source_id'] = raw['ext_ids']['doi']
- if (not raw.get('link_source')
- and raw.get('ingest_request_source', '').startswith('savepapernow')
- and raw.get('fatcat', {}).get('release_ident')):
+ raw["link_source"] = "doi"
+ raw["link_source_id"] = raw["ext_ids"]["doi"]
+ if (
+ not raw.get("link_source")
+ and raw.get("ingest_request_source", "").startswith("savepapernow")
+ and raw.get("fatcat", {}).get("release_ident")
+ ):
# set link_source(_id) for old ingest requests
- raw['link_source'] = 'spn'
- raw['link_source_id'] = raw['fatcat']['release_ident']
+ raw["link_source"] = "spn"
+ raw["link_source_id"] = raw["fatcat"]["release_ident"]
- for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
- if not k in raw:
- self.counts['skip-request-fields'] += 1
+ for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
+ if k not in raw:
+ self.counts["skip-request-fields"] += 1
return None
- if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
- self.counts['skip-ingest-type'] += 1
+ if raw["ingest_type"] not in ("pdf", "xml", "html"):
+ self.counts["skip-ingest-type"] += 1
+ return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
return None
request = {
- 'ingest_type': raw['ingest_type'],
- 'base_url': raw['base_url'],
- 'link_source': raw['link_source'],
- 'link_source_id': raw['link_source_id'],
- 'ingest_request_source': raw.get('ingest_request_source'),
- 'request': {},
+ "ingest_type": raw["ingest_type"],
+ "base_url": raw["base_url"],
+ "link_source": raw["link_source"],
+ "link_source_id": raw["link_source_id"],
+ "ingest_request_source": raw.get("ingest_request_source"),
+ "request": {},
}
# extra/optional fields
- if raw.get('release_stage'):
- request['release_stage'] = raw['release_stage']
- if raw.get('fatcat', {}).get('release_ident'):
- request['request']['release_ident'] = raw['fatcat']['release_ident']
- for k in ('ext_ids', 'edit_extra', 'rel'):
+ if raw.get("release_stage"):
+ request["release_stage"] = raw["release_stage"]
+ if raw.get("fatcat", {}).get("release_ident"):
+ request["request"]["release_ident"] = raw["fatcat"]["release_ident"]
+ for k in ("ext_ids", "edit_extra", "rel"):
if raw.get(k):
- request['request'][k] = raw[k]
+ request["request"][k] = raw[k]
# if this dict is empty, trim it to save DB space
- if not request['request']:
- request['request'] = None
+ if not request["request"]:
+ request["request"] = None
return request
-
def file_result_to_row(self, raw: dict) -> Optional[dict]:
"""
@@ -127,208 +132,302 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if there is a problem with conversion, return None and set skip count
"""
- for k in ('request', 'hit', 'status'):
- if not k in raw:
- self.counts['skip-result-fields'] += 1
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ self.counts["skip-result-fields"] += 1
return None
- if not 'base_url' in raw['request']:
- self.counts['skip-result-fields'] += 1
+ if "base_url" not in raw["request"]:
+ self.counts["skip-result-fields"] += 1
return None
- ingest_type = raw['request'].get('ingest_type')
- if ingest_type == 'file':
- ingest_type = 'pdf'
- if ingest_type not in ('pdf', 'xml', 'html'):
- self.counts['skip-ingest-type'] += 1
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type == "file":
+ ingest_type = "pdf"
+ if ingest_type not in (
+ "pdf",
+ "xml",
+ "html",
+ "component",
+ "src",
+ "dataset",
+ "dataset-file",
+ ):
+ self.counts["skip-ingest-type"] += 1
return None
- if raw['status'] in ("existing", ):
- self.counts['skip-existing'] += 1
+ if raw["status"] in ("existing",):
+ self.counts["skip-existing"] += 1
return None
result = {
- 'ingest_type': ingest_type,
- 'base_url': raw['request']['base_url'],
- 'hit': raw['hit'],
- 'status': raw['status'],
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
}
- terminal = raw.get('terminal')
+ terminal = raw.get("terminal")
if terminal:
- result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url')
- result['terminal_dt'] = terminal.get('terminal_dt')
- result['terminal_status_code'] = terminal.get('terminal_status_code') or terminal.get('status_code') or terminal.get('http_code')
- if result['terminal_status_code']:
- result['terminal_status_code'] = int(result['terminal_status_code'])
- result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+ result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url")
+ result["terminal_dt"] = terminal.get("terminal_dt")
+ result["terminal_status_code"] = (
+ terminal.get("terminal_status_code")
+ or terminal.get("status_code")
+ or terminal.get("http_code")
+ )
+ if result["terminal_status_code"]:
+ result["terminal_status_code"] = int(result["terminal_status_code"])
+ result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
+            if result["terminal_url"] and len(result["terminal_url"]) > 2048:
+ # postgresql13 doesn't like extremely large URLs in b-tree index
+ self.counts["skip-huge-url"] += 1
+ return None
return result
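For reference, a sketch of the terminal-field coalescing above, using hypothetical values; it assumes a reachable sandcrawler postgres database for the worker constructor.

    from sandcrawler.persist import PersistIngestFileResultWorker

    worker = PersistIngestFileResultWorker(db_url="postgres:///sandcrawler")
    raw = {
        "request": {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"},
        "hit": True,
        "status": "success",
        "terminal": {
            "url": "https://example.com/paper.pdf",  # older spelling of terminal_url
            "http_code": 200,                        # older spelling of terminal_status_code
            "terminal_dt": "20211012060000",
        },
    }
    row = worker.file_result_to_row(raw)
    # row["terminal_url"] == "https://example.com/paper.pdf"
    # row["terminal_status_code"] == 200  (coerced to int)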
def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
- html_body = record.get('html_body')
- file_meta = record.get('file_meta')
+ html_body = record.get("html_body")
+ file_meta = record.get("file_meta")
if not (file_meta and html_body):
return None
return HtmlMetaRow(
sha1hex=file_meta["sha1hex"],
- status=record.get('status'),
- scope=record.get('scope'),
- has_teixml=bool(html_body and html_body['status'] == 'success'),
+ status=record.get("status"),
+ scope=record.get("scope"),
+ has_teixml=bool(html_body and html_body["status"] == "success"),
has_thumbnail=False, # TODO
- word_count=(html_body and html_body.get('word_count')) or None,
- biblio=record.get('html_biblio'),
- resources=record.get('html_resources'),
+ word_count=(html_body and html_body.get("word_count")) or None,
+ biblio=record.get("html_biblio"),
+ resources=record.get("html_resources"),
)
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def result_to_platform_row(self, raw: dict) -> Optional[dict]:
+ """
+ Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL ingest_fileset_platform schema
+
+        if there is a problem with conversion, return None
+ """
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ return None
+ if "base_url" not in raw["request"]:
+ return None
+ ingest_type = raw["request"].get("ingest_type")
+        if ingest_type not in ("dataset",):
+ return None
+ if raw["status"] in ("existing",):
+ return None
+ if not raw.get("platform_name"):
+ return None
+ result = {
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
+ "platform_name": raw.get("platform_name"),
+ "platform_domain": raw.get("platform_domain"),
+ "platform_id": raw.get("platform_id"),
+ "ingest_strategy": raw.get("ingest_strategy"),
+ "total_size": raw.get("total_size"),
+ "file_count": raw.get("file_count"),
+ "archiveorg_item_name": raw.get("archiveorg_item_name"),
+ "archiveorg_item_bundle_path": None,
+ "web_bundle_url": None,
+ "web_bundle_dt": None,
+ "manifest": raw.get("manifest"),
+ }
+        if raw.get("fileset_bundle"):
+            result["archiveorg_item_bundle_path"] = raw["fileset_bundle"].get(
+                "archiveorg_item_bundle_path"
+            )
+            result["web_bundle_url"] = (
+                raw["fileset_bundle"].get("terminal", {}).get("terminal_url")
+            )
+            result["web_bundle_dt"] = (
+                raw["fileset_bundle"].get("terminal", {}).get("terminal_dt")
+            )
+ return result
+
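Similarly, a hypothetical fileset ingest result and the ingest_fileset_platform row it maps to; only the keys read above matter, and the platform and strategy values are illustrative.

    worker = PersistIngestFileResultWorker(db_url="postgres:///sandcrawler")
    raw = {
        "request": {"ingest_type": "dataset", "base_url": "https://example.com/dataset/123"},
        "hit": True,
        "status": "success",
        "platform_name": "dataverse",
        "platform_domain": "dataverse.example.edu",
        "platform_id": "doi:10.1234/example",
        "ingest_strategy": "archiveorg-fileset",
        "total_size": 123456,
        "file_count": 3,
        "archiveorg_item_name": "example-item",
        "manifest": [],
    }
    row = worker.result_to_platform_row(raw)
    # row["web_bundle_url"] and row["web_bundle_dt"] stay None unless the raw
    # result carries a "fileset_bundle" section (handled above)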
+ def push_batch(self, batch: List[Any]) -> List[Any]:
+ self.counts["total"] += len(batch)
if not batch:
return []
- results = [self.file_result_to_row(raw) for raw in batch]
- results = [r for r in results if r]
+ results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
+ results = [r for r in results_unfiltered if r]
- requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
- requests = [r for r in requests if r]
+ irequests_unfiltered = [
+ self.request_to_row(raw["request"]) for raw in batch if raw.get("request")
+ ]
+ irequests = [
+ r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file"
+ ]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
if results:
resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
- self.counts['insert-results'] += resp[0]
- self.counts['update-results'] += resp[1]
+ self.counts["insert-results"] += resp[0]
+ self.counts["update-results"] += resp[1]
# these schemas match, so can just pass through
- cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')]
- revisit_cdx_batch = [r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx')]
+ cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
+ revisit_cdx_batch = [
+ r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")
+ ]
cdx_batch.extend(revisit_cdx_batch)
# filter to full CDX lines, with full warc_paths (not liveweb)
- cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])]
+ cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])]
if cdx_batch:
resp = self.db.insert_cdx(self.cur, cdx_batch)
- self.counts['insert-cdx'] += resp[0]
- self.counts['update-cdx'] += resp[1]
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")]
if file_meta_batch:
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
- self.counts['insert-file_meta'] += resp[0]
- self.counts['update-file_meta'] += resp[1]
+ self.counts["insert-file_meta"] += resp[0]
+ self.counts["update-file_meta"] += resp[1]
- html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')]
+ html_meta_batch = [
+ self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body")
+ ]
if html_meta_batch:
- resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="update")
- self.counts['insert-html_meta'] += resp[0]
- self.counts['update-html_meta'] += resp[1]
+ rows = [d.to_sql_tuple() for d in html_meta_batch if d]
+ resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-html_meta"] += resp[0]
+ self.counts["update-html_meta"] += resp[1]
+
+ fileset_platform_batch_all = [
+ self.result_to_platform_row(raw)
+ for raw in batch
+ if raw.get("request", {}).get("ingest_type") == "dataset"
+ and raw.get("platform_name")
+ ]
+ fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p]
+ if fileset_platform_batch:
+ resp = self.db.insert_ingest_fileset_platform(
+ self.cur, fileset_platform_batch, on_conflict="update"
+ )
+ self.counts["insert-fileset_platform"] += resp[0]
+ self.counts["update-fileset_platform"] += resp[1]
self.db.commit()
return []
-class PersistIngestRequestWorker(PersistIngestFileResultWorker):
- def __init__(self, db_url, **kwargs):
+class PersistIngestFilesetWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__(db_url=db_url)
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
if not batch:
return []
- requests = [self.request_to_row(raw) for raw in batch]
- requests = [r for r in requests if r]
+ irequests_all = [self.request_to_row(raw) for raw in batch]
+ irequests: List[Dict] = [r for r in irequests_all if r]
- if requests:
- resp = self.db.insert_ingest_request(self.cur, requests)
- self.counts['insert-requests'] += resp[0]
- self.counts['update-requests'] += resp[1]
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
self.db.commit()
return []
-class PersistGrobidWorker(SandcrawlerWorker):
- def __init__(self, db_url, **kwargs):
+class PersistGrobidWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.grobid = GrobidClient()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
# filter out bad "missing status_code" timeout rows
- missing = [r for r in batch if not r.get('status_code')]
+ missing = [r for r in batch if not r.get("status_code")]
if missing:
- self.counts['skip-missing-status'] += len(missing)
- batch = [r for r in batch if r.get('status_code')]
+ self.counts["skip-missing-status"] += len(missing)
+ batch = [r for r in batch if r.get("status_code")]
for r in batch:
- if r['status_code'] != 200 or not r.get('tei_xml'):
- self.counts['s3-skip-status'] += 1
- if r.get('error_msg'):
- r['metadata'] = {'error_msg': r['error_msg'][:500]}
+ if r["status_code"] != 200 or not r.get("tei_xml"):
+ self.counts["s3-skip-status"] += 1
+ if r.get("error_msg"):
+ r["metadata"] = {"error_msg": r["error_msg"][:500]}
continue
- assert len(r['key']) == 40
+ assert len(r["key"]) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="grobid",
- blob=r['tei_xml'],
- sha1hex=r['key'],
+ blob=r["tei_xml"],
+ sha1hex=r["key"],
extension=".tei.xml",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
- # enhance with teixml2json metadata, if available
+ # enhance with GROBID TEI-XML metadata, if available
try:
metadata = self.grobid.metadata(r)
except xml.etree.ElementTree.ParseError as xml_e:
- r['status'] = 'bad-grobid-xml'
- r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+ r["status"] = "bad-grobid-xml"
+ r["metadata"] = {"error_msg": str(xml_e)[:1024]}
continue
if not metadata:
continue
- for k in ('fatcat_release', 'grobid_version'):
+ for k in ("fatcat_release", "grobid_version"):
r[k] = metadata.pop(k, None)
- if r.get('fatcat_release'):
- r['fatcat_release'] = r['fatcat_release'].replace('release_', '')
- if metadata.get('grobid_timestamp'):
- r['updated'] = metadata['grobid_timestamp']
- r['metadata'] = metadata
+ if r.get("fatcat_release"):
+ r["fatcat_release"] = r["fatcat_release"].replace("release_", "")
+ if metadata.get("grobid_timestamp"):
+ r["updated"] = metadata["grobid_timestamp"]
+ r["metadata"] = metadata
if not self.s3_only:
+ assert self.db and self.cur
resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
- self.counts['insert-grobid'] += resp[0]
- self.counts['update-grobid'] += resp[1]
+ self.counts["insert-grobid"] += resp[0]
+ self.counts["update-grobid"] += resp[1]
- file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("file_meta")]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
@@ -342,11 +441,11 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
This could be refactored into a "Sink" type with an even thinner wrapper.
"""
- def __init__(self, output_dir):
+ def __init__(self, output_dir: str):
super().__init__()
self.output_dir = output_dir
- def _blob_path(self, sha1hex, extension=".tei.xml"):
+ def _blob_path(self, sha1hex: str, extension: str = ".tei.xml") -> str:
obj_path = "{}/{}/{}{}".format(
sha1hex[0:2],
sha1hex[2:4],
@@ -355,48 +454,49 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
)
return obj_path
- def process(self, record, key=None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
- if record.get('status_code') != 200 or not record.get('tei_xml'):
+ if record.get("status_code") != 200 or not record.get("tei_xml"):
return False
- assert(len(record['key'])) == 40
- p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
+ assert (len(record["key"])) == 40
+ p = "{}/{}".format(self.output_dir, self._blob_path(record["key"]))
os.makedirs(os.path.dirname(p), exist_ok=True)
- with open(p, 'w') as f:
- f.write(record.pop('tei_xml'))
- self.counts['written'] += 1
+ with open(p, "w") as f:
+ f.write(record.pop("tei_xml"))
+ self.counts["written"] += 1
return record
class PersistPdfTrioWorker(SandcrawlerWorker):
-
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
- batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+ batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")]
for r in batch:
# copy key (sha1hex) into sub-object
- r['pdf_trio']['key'] = r['key']
- pdftrio_batch = [r['pdf_trio'] for r in batch]
+ r["pdf_trio"]["key"] = r["key"]
+ pdftrio_batch = [r["pdf_trio"] for r in batch]
resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
- self.counts['insert-pdftrio'] += resp[0]
- self.counts['update-pdftrio'] += resp[1]
-
- file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+ self.counts["insert-pdftrio"] += resp[0]
+ self.counts["update-pdftrio"] += resp[1]
+
+ file_meta_batch = [
+ r["file_meta"]
+ for r in batch
+ if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
+ ]
resp = self.db.insert_file_meta(self.cur, file_meta_batch)
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
return []
@@ -409,63 +509,63 @@ class PersistPdfTextWorker(SandcrawlerWorker):
Should keep batch sizes small.
"""
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_only = kwargs.get('s3_only', False)
- self.db_only = kwargs.get('db_only', False)
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
if not self.s3_only:
- self.db = SandcrawlerPostgresClient(db_url)
- self.cur = self.db.conn.cursor()
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
else:
self.db = None
self.cur = None
- def process(self, record, key=None):
- """
- Only do batches (as transactions)
- """
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
raise NotImplementedError
- def push_batch(self, batch):
- self.counts['total'] += len(batch)
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
parsed_batch = []
for r in batch:
parsed_batch.append(PdfExtractResult.from_pdftext_dict(r))
for r in parsed_batch:
- if r.status != 'success' or not r.text:
- self.counts['s3-skip-status'] += 1
+ if r.status != "success" or not r.text:
+ self.counts["s3-skip-status"] += 1
if r.error_msg:
- r.metadata = {'error_msg': r.error_msg[:500]}
+ r.metadata = {"error_msg": r.error_msg[:500]}
continue
assert len(r.sha1hex) == 40
if not self.db_only:
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder="text",
blob=r.text,
sha1hex=r.sha1hex,
extension=".txt",
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
if not self.s3_only:
- resp = self.db.insert_pdf_meta(self.cur, parsed_batch, on_conflict="update")
- self.counts['insert-pdf-meta'] += resp[0]
- self.counts['update-pdf-meta'] += resp[1]
+ assert self.db and self.cur
+ rows = [r.to_sql_tuple() for r in parsed_batch]
+ resp = self.db.insert_pdf_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-pdf-meta"] += resp[0]
+ self.counts["update-pdf-meta"] += resp[1]
file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta]
resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
- self.counts['insert-file-meta'] += resp[0]
- self.counts['update-file-meta'] += resp[1]
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
self.db.commit()
@@ -484,32 +584,33 @@ class PersistThumbnailWorker(SandcrawlerWorker):
def __init__(self, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_extension = kwargs.get('s3_extension', ".jpg")
- self.s3_folder = kwargs.get('s3_folder', "pdf")
+ self.s3_extension = kwargs.get("s3_extension", ".jpg")
+ self.s3_folder = kwargs.get("s3_folder", "pdf")
- def process(self, blob: bytes, key: Optional[str] = None):
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
"""
Processing raw messages, not decoded JSON objects
"""
+ assert isinstance(record, bytes)
+ blob: bytes = record
if isinstance(key, bytes):
- key = key.decode('utf-8')
+ key = key.decode("utf-8")
assert key is not None and len(key) == 40 and isinstance(key, str)
- assert isinstance(blob, bytes)
assert len(blob) >= 50
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
blob=blob,
sha1hex=key,
extension=self.s3_extension,
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
class GenericPersistDocWorker(SandcrawlerWorker):
@@ -522,36 +623,36 @@ class GenericPersistDocWorker(SandcrawlerWorker):
def __init__(self, **kwargs):
super().__init__()
self.s3 = SandcrawlerMinioClient(
- host_url=kwargs.get('s3_url', 'localhost:9000'),
- access_key=kwargs['s3_access_key'],
- secret_key=kwargs['s3_secret_key'],
- default_bucket=kwargs['s3_bucket'],
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
)
- self.s3_extension = kwargs.get('s3_extension', ".unknown")
- self.s3_folder = kwargs.get('s3_folder', "unknown")
+ self.s3_extension = kwargs.get("s3_extension", ".unknown")
+ self.s3_folder = kwargs.get("s3_folder", "unknown")
self.doc_key = "unknown"
- def process(self, record: dict, key: Optional[AnyStr] = None) -> None:
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
- if record.get('status') != 'success' or not record.get(self.doc_key):
+ if record.get("status") != "success" or not record.get(self.doc_key):
return
assert key is not None
if isinstance(key, bytes):
- key_str = key.decode('utf-8')
+ key_str = key.decode("utf-8")
elif isinstance(key, str):
key_str = key
assert len(key_str) == 40
- if 'sha1hex' in record:
- assert key_str == record['sha1hex']
+ if "sha1hex" in record:
+ assert key_str == record["sha1hex"]
- resp = self.s3.put_blob(
+ self.s3.put_blob(
folder=self.s3_folder,
- blob=record[self.doc_key].encode('utf-8'),
+ blob=record[self.doc_key].encode("utf-8"),
sha1hex=key_str,
extension=self.s3_extension,
)
- self.counts['s3-put'] += 1
+ self.counts["s3-put"] += 1
class PersistXmlDocWorker(GenericPersistDocWorker):
@@ -562,8 +663,8 @@ class PersistXmlDocWorker(GenericPersistDocWorker):
def __init__(self, **kwargs):
super().__init__(**kwargs)
- self.s3_extension = kwargs.get('s3_extension', ".jats.xml")
- self.s3_folder = kwargs.get('s3_folder', "xml_doc")
+ self.s3_extension = kwargs.get("s3_extension", ".jats.xml")
+ self.s3_folder = kwargs.get("s3_folder", "xml_doc")
self.doc_key = "jats_xml"
@@ -575,6 +676,110 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
def __init__(self, **kwargs):
super().__init__(**kwargs)
- self.s3_extension = kwargs.get('s3_extension', ".tei.xml")
- self.s3_folder = kwargs.get('s3_folder', "html_body")
+ self.s3_extension = kwargs.get("s3_extension", ".tei.xml")
+ self.s3_folder = kwargs.get("s3_folder", "html_body")
self.doc_key = "tei_xml"
+
+
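The two subclasses above differ only in S3 folder, extension, and the record field they persist. A new document type could be handled with an equally thin subclass; the sketch below is hypothetical and not part of this change.

    from sandcrawler.persist import GenericPersistDocWorker


    class PersistJsonDocWorker(GenericPersistDocWorker):
        """
        Hypothetical subclass: persists a 'json_doc' string field from result
        records into a 'json_doc' S3 folder.
        """

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.s3_extension = kwargs.get("s3_extension", ".json")
            self.s3_folder = kwargs.get("s3_folder", "json_doc")
            self.doc_key = "json_doc"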
+class PersistCrossrefWorker(SandcrawlerWorker):
+ """
+    Pushes Crossref API JSON records into postgresql. Can also use GROBID to
+    parse 'unstructured' references and push the parsed results into
+    postgresql at the same time.
+ """
+
+ def __init__(
+ self,
+ db_url: str,
+ grobid_client: Optional[GrobidClient],
+ parse_refs: bool = True,
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ if grobid_client:
+ self.grobid_client = grobid_client
+ else:
+ self.grobid_client = GrobidClient()
+ self.parse_refs = parse_refs
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ crossref_batch = []
+ refs_batch = []
+ for record in batch:
+ crossref_batch.append(
+ dict(
+ doi=record["DOI"].lower().strip(),
+ indexed=record["indexed"]["date-time"],
+ record=record,
+ )
+ )
+ if self.parse_refs:
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (
+ xml.etree.ElementTree.ParseError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.ReadTimeout,
+ ):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
+
+ resp = self.db.insert_crossref(self.cur, crossref_batch)
+ if len(crossref_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(crossref_batch)
+ self.counts["insert-crossref"] += resp[0]
+ self.counts["update-crossref"] += resp[1]
+
+ if refs_batch:
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
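A minimal usage sketch for this worker, assuming a local sandcrawler postgres database and a reachable GROBID instance; the input filename is hypothetical.

    from sandcrawler import GrobidClient
    from sandcrawler.persist import PersistCrossrefWorker
    from sandcrawler.workers import JsonLinePusher

    worker = PersistCrossrefWorker(
        db_url="postgres:///sandcrawler",
        grobid_client=GrobidClient(host_url="https://grobid.qa.fatcat.wiki"),
        parse_refs=True,
    )
    with open("crossref_records.json") as f:  # newline-delimited Crossref API records
        JsonLinePusher(worker, f, batch_size=10).run()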
+class PersistGrobidRefsWorker(SandcrawlerWorker):
+ """
+ Simple persist worker to backfill GROBID references in to postgresql
+ locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
+ """
+
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ refs_batch = []
+ for record in batch:
+ assert record["source"]
+ assert record["source_id"]
+ refs_batch.append(record)
+
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..356f050 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,22 @@
-
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+from typing import Any, Dict, List, Optional, Sequence
-from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import (
+ PetaboxError,
+ SandcrawlerBackoffError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .misc import parse_cdx_line, requests_retry_session
class SandcrawlerWorker(object):
@@ -21,31 +27,30 @@ class SandcrawlerWorker(object):
worker (pipeline-style), or defaults to stdout.
"""
- def __init__(self):
- self.counts = Counter()
- self.sink = None
- # TODO: self.counters
+ def __init__(self, sink: Optional["SandcrawlerWorker"] = None):
+ self.counts: Counter = Counter()
+ self.sink: Optional[SandcrawlerWorker] = sink
- def push_record(self, task, key=None):
- self.counts['total'] += 1
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if not self.want(task):
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
result = self.process(task, key=key)
if not result:
- self.counts['failed'] += 1
+ self.counts["failed"] += 1
return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return result
- def timeout_response(self, task):
+ def timeout_response(self, task: Any) -> Any:
"""
This should be overridden by workers that want to return something
meaningful when there is a processing timeout. Eg, JSON vs some other
@@ -53,7 +58,9 @@ class SandcrawlerWorker(object):
"""
return None
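As an illustration, a hypothetical subclass that returns a status record on timeout, so push_record_timeout() below still has something to push to the sink; the "error-timeout" status value is illustrative.

    from typing import Any, Optional

    from sandcrawler.workers import SandcrawlerWorker


    class ExampleJsonWorker(SandcrawlerWorker):
        """Hypothetical worker, only to illustrate the timeout_response() hook."""

        def process(self, task: Any, key: Optional[str] = None) -> Any:
            return dict(key=key, status="success")

        def timeout_response(self, task: Any) -> Any:
            # returned instead of None when processing hits the SIGALRM timeout,
            # so downstream sinks still see a record for the task
            return dict(key=task.get("key"), source=task, status="error-timeout")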
- def push_record_timeout(self, task, key=None, timeout=300):
+ def push_record_timeout(
+ self, task: Any, key: Optional[str] = None, timeout: int = 300
+ ) -> Any:
"""
A wrapper around self.push_record which sets a timeout.
@@ -62,49 +69,52 @@ class SandcrawlerWorker(object):
same process.
"""
- def timeout_handler(signum, frame):
+ def timeout_handler(signum: int, frame: Any) -> None:
raise TimeoutError("timeout processing record")
+
signal.signal(signal.SIGALRM, timeout_handler)
resp = None
signal.alarm(int(timeout))
try:
resp = self.push_record(task, key=key)
except TimeoutError:
- self.counts['timeout'] += 1
- resp = self.timeout_response(task) # pylint: disable=assignment-from-none
+ self.counts["timeout"] += 1
+ resp = self.timeout_response(task) # pylint: disable=assignment-from-none
# TODO: what if it is this push_record() itself that is timing out?
if resp and self.sink:
self.sink.push_record(resp)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
elif resp:
print(json.dumps(resp))
finally:
signal.alarm(0)
return resp
- def push_batch(self, tasks):
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
results = []
for task in tasks:
results.append(self.push_record(task))
return results
- def finish(self):
+ def finish(self) -> Counter:
if self.sink:
self.sink.finish()
print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
- def want(self, task):
+ def want(self, task: Any) -> bool:
"""
Optionally override this as a filter in implementations.
"""
return True
- def process(self, task, key=None):
+ def process(self, task: Any, key: Optional[str] = None) -> Any:
"""
Derived workers need to implement business logic here.
+
+ TODO: should derived workers explicitly type-check the 'task' object?
"""
- raise NotImplementedError('implementation required')
+ raise NotImplementedError("implementation required")
class SandcrawlerFetchWorker(SandcrawlerWorker):
@@ -113,26 +123,26 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
PDFs) from wayback, archive.org, or other sources.
"""
- def __init__(self, wayback_client, **kwargs):
+ def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
super().__init__(**kwargs)
self.wayback_client = wayback_client
+ self.http_session = requests_retry_session()
- def fetch_blob(self, record):
- start_process = time.time()
- default_key = record['sha1hex']
+ def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = record["sha1hex"]
wayback_sec = None
petabox_sec = None
- if record.get('warc_path') and record.get('warc_offset'):
+ if record.get("warc_path") and record.get("warc_offset"):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
- blob = self.wayback_client.fetch_petabox_body(
- csize=record['warc_csize'],
- offset=record['warc_offset'],
- warc_path=record['warc_path'],
+ blob: bytes = self.wayback_client.fetch_petabox_body(
+ csize=record["warc_csize"],
+ offset=record["warc_offset"],
+ warc_path=record["warc_path"],
)
wayback_sec = time.time() - start
except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
@@ -142,15 +152,15 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-wayback",
error_msg=str(we),
)
- elif record.get('url') and record.get('datetime'):
+ elif record.get("url") and record.get("datetime"):
# it's a partial CDX dict or something? fetch using WaybackClient
if not self.wayback_client:
- raise Exception("wayback client not configured for this PdfTrioWorker")
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
blob = self.wayback_client.fetch_replay_body(
- url=record['url'],
- datetime=record['datetime'],
+ url=record["url"],
+ datetime=record["datetime"],
)
wayback_sec = time.time() - start
except (WaybackError, WaybackContentError) as we:
@@ -160,14 +170,15 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-wayback",
error_msg=str(we),
)
- elif record.get('item') and record.get('path'):
+ elif record.get("item") and record.get("path"):
# it's petabox link; fetch via HTTP
start = time.time()
- resp = requests.get("https://archive.org/serve/{}/{}".format(
- record['item'], record['path']))
+ ia_resp = self.http_session.get(
+ "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
+ )
petabox_sec = time.time() - start
try:
- resp.raise_for_status()
+ ia_resp.raise_for_status()
except Exception as e:
return dict(
key=default_key,
@@ -175,55 +186,67 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-petabox",
error_msg=str(e),
)
- blob = resp.content
+ blob = ia_resp.content
else:
- raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
+ raise ValueError(
+ "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed"
+ )
if not blob:
return dict(
key=default_key,
source=record,
status="empty-blob",
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
return dict(
key=default_key,
status="success",
source=record,
blob=blob,
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
)
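For reference, the three record shapes fetch_blob() dispatches on, with hypothetical values; in each case the returned dict carries key, status, source, blob, and the wayback_sec/petabox_sec timings.

    # 1. full CDX row (warc_* fields): fetched via WaybackClient.fetch_petabox_body()
    record_cdx = {
        "sha1hex": "0000000000000000000000000000000000000000",  # placeholder
        "warc_path": "EXAMPLE-CRAWL/EXAMPLE-20211012-00000.warc.gz",
        "warc_offset": 123456,
        "warc_csize": 7890,
    }
    # 2. url + datetime: fetched via WaybackClient.fetch_replay_body()
    record_replay = {
        "sha1hex": "0000000000000000000000000000000000000000",
        "url": "https://example.com/paper.pdf",
        "datetime": "20211012060000",
    }
    # 3. archive.org item + path: fetched over HTTP from https://archive.org/serve/{item}/{path}
    record_petabox = {
        "sha1hex": "0000000000000000000000000000000000000000",
        "item": "example-item",
        "path": "paper.pdf",
    }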
-class MultiprocessWrapper(SandcrawlerWorker):
- def __init__(self, worker, sink, jobs=None):
+class MultiprocessWrapper(SandcrawlerWorker):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ sink: Optional[SandcrawlerWorker] = None,
+ jobs: Optional[int] = None,
+ ):
self.counts = Counter()
self.worker = worker
self.sink = sink
self.pool = multiprocessing.pool.Pool(jobs)
- def push_batch(self, tasks):
- self.counts['total'] += len(tasks)
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ self.counts["total"] += len(tasks)
print("... processing batch of: {}".format(len(tasks)), file=sys.stderr)
results = self.pool.map(self.worker.process, tasks)
for result in results:
if not result:
- self.counts['failed'] += 1
- return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ self.counts["failed"] += 1
+ return []
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return results
- def finish(self):
+ def finish(self) -> Counter:
self.pool.terminate()
if self.sink:
self.sink.finish()
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
- return worker_counts
+ return self.counts
+
class BlackholeSink(SandcrawlerWorker):
"""
@@ -232,73 +255,73 @@ class BlackholeSink(SandcrawlerWorker):
Useful for tests.
"""
- def push_record(self, task, key=None):
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
return
- def push_batch(self, tasks):
- return
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ return []
-class KafkaSink(SandcrawlerWorker):
- def __init__(self, kafka_hosts, produce_topic, **kwargs):
+class KafkaSink(SandcrawlerWorker):
+ def __init__(self, kafka_hosts: str, produce_topic: str, **kwargs):
self.sink = None
self.counts = Counter()
self.produce_topic = produce_topic
self.kafka_hosts = kafka_hosts
- config = self.producer_config({
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes
- 'api.version.request': True,
- 'api.version.fallback.ms': 0,
- })
+ config = self.producer_config(
+ {
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 30000000, # ~30 MBytes; broker is ~50 MBytes
+ "api.version.request": True,
+ "api.version.fallback.ms": 0,
+ }
+ )
self.producer = Producer(config)
-
@staticmethod
- def _fail_fast(err, msg):
+ def _fail_fast(err: Any, msg: Any) -> None:
if err is not None:
print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: dict) -> dict:
config = kafka_config.copy()
- config.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
- def push_record(self, msg, key=None):
- self.counts['total'] += 1
+ def push_record(self, msg: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if type(msg) == dict:
- if not key and 'key' in msg:
- key = msg['key']
+ if not key and "key" in msg:
+ key = msg["key"]
msg = json.dumps(msg)
if type(msg) == str:
- msg = msg.encode('utf-8')
+ msg = msg.encode("utf-8")
assert type(msg) == bytes
- self.producer.produce(
- self.produce_topic,
- msg,
- key=key,
- on_delivery=self._fail_fast)
- self.counts['produced'] += 1
+ self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast)
+ self.counts["produced"] += 1
# check for errors etc
self.producer.poll(0)
- def push_batch(self, msgs):
+ def push_batch(self, msgs: List[Any]) -> List[Any]:
for m in msgs:
self.push_record(m)
+ return []
- def finish(self):
+ def finish(self) -> Counter:
self.producer.flush()
return self.counts
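A minimal producer-side sketch; broker and topic names are illustrative.

    from sandcrawler.workers import KafkaSink

    sink = KafkaSink(
        kafka_hosts="localhost:9092",
        produce_topic="sandcrawler-dev.ingest-file-results",
    )
    sink.push_record({"key": "example-key", "status": "success"})
    sink.finish()  # flushes the producer and returns the Counter of produced messages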
@@ -308,19 +331,21 @@ class KafkaCompressSink(KafkaSink):
Variant of KafkaSink for large documents. Used for, eg, GROBID output.
"""
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: Dict[str, Any]) -> Dict[str, Any]:
config = kafka_config.copy()
- config.update({
- 'compression.codec': 'gzip',
- 'retry.backoff.ms': 250,
- 'linger.ms': 1000,
- 'batch.num.messages': 50,
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'message.timeout.ms': 30000,
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "compression.codec": "gzip",
+ "retry.backoff.ms": 250,
+ "linger.ms": 1000,
+ "batch.num.messages": 50,
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
@@ -330,11 +355,11 @@ class RecordPusher:
    trivial interface, just wraps an importer and pushes records into it.
"""
- def __init__(self, worker, **kwargs):
- self.counts = Counter()
- self.worker = worker
+ def __init__(self, worker: SandcrawlerWorker, **kwargs):
+ self.counts: Counter = Counter()
+ self.worker: SandcrawlerWorker = worker
- def run(self):
+ def run(self) -> Counter:
"""
This will look something like:
@@ -347,133 +372,140 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
- def __init__(self, worker, json_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, json_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.json_file = json_file
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.json_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
try:
record = json.loads(line)
except json.decoder.JSONDecodeError:
- self.counts['error-json-decode'] += 1
+ self.counts["error-json-decode"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class CdxLinePusher(RecordPusher):
-
- def __init__(self, worker, cdx_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, cdx_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.cdx_file = cdx_file
- self.filter_http_statuses = kwargs.get('filter_http_statuses', None)
- self.filter_mimetypes = kwargs.get('filter_mimetypes', None)
- self.allow_octet_stream = kwargs.get('allow_octet_stream', False)
- self.batch_size = kwargs.get('batch_size', None)
+ self.filter_http_statuses = kwargs.get("filter_http_statuses", None)
+ self.filter_mimetypes = kwargs.get("filter_mimetypes", None)
+ self.allow_octet_stream = kwargs.get("allow_octet_stream", False)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.cdx_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
record = parse_cdx_line(line, normalize=True)
if not record:
- self.counts['skip-parse'] += 1
+ self.counts["skip-parse"] += 1
continue
- if self.filter_http_statuses and record['http_status'] not in self.filter_http_statuses:
- self.counts['skip-http_status'] += 1
+ if (
+ self.filter_http_statuses
+ and record["http_status"] not in self.filter_http_statuses
+ ):
+ self.counts["skip-http_status"] += 1
continue
- if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes:
- self.counts['skip-mimetype'] += 1
+ if self.filter_mimetypes and record["mimetype"] not in self.filter_mimetypes:
+ self.counts["skip-mimetype"] += 1
continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
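For example, pushing only 200/226-status PDF captures from a CDX file, using the BlackholeSink defined below as a stand-in worker; the filename is hypothetical.

    from sandcrawler.workers import BlackholeSink, CdxLinePusher

    sink = BlackholeSink()  # any SandcrawlerWorker would do here
    with open("example.cdx") as f:
        pusher = CdxLinePusher(
            sink,
            f,
            filter_http_statuses=[200, 226],
            filter_mimetypes=["application/pdf"],
            batch_size=0,  # 0 or 1 disables batching; records pushed one at a time
        )
        counts = pusher.run()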
class ZipfilePusher(RecordPusher):
-
- def __init__(self, worker, zipfile_path, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, zipfile_path: str, **kwargs):
self.counts = Counter()
self.worker = worker
self.filter_suffix = ".pdf"
self.zipfile_path = zipfile_path
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
- with zipfile.ZipFile(self.zipfile_path, 'r') as archive:
+ with zipfile.ZipFile(self.zipfile_path, "r") as archive:
for zipinfo in archive.infolist():
if not zipinfo.filename.endswith(self.filter_suffix):
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
# NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
- flo = archive.open(zipinfo, 'r')
+ flo = archive.open(zipinfo, "r")
data = flo.read(2**32)
flo.close()
if self.batch_size:
batch.append(data)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(data)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
-class KafkaJsonPusher(RecordPusher):
- def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs):
+class KafkaJsonPusher(RecordPusher):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ kafka_hosts: str,
+ consume_topic: str,
+ group: str,
+ **kwargs
+ ):
self.counts = Counter()
self.worker = worker
self.consumer = make_kafka_consumer(
@@ -481,29 +513,32 @@ class KafkaJsonPusher(RecordPusher):
consume_topic,
group,
)
- self.push_batches = kwargs.get('push_batches', False)
- self.raw_records = kwargs.get('raw_records', False)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.batch_size = kwargs.get('batch_size', 100)
+ self.push_batches = kwargs.get("push_batches", False)
+ self.raw_records = kwargs.get("raw_records", False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.batch_size = kwargs.get("batch_size", 100)
if self.batch_size in (0, 1):
self.batch_size = 1
- self.batch_worker = kwargs.get('batch_worker', False)
- self.process_timeout_sec = kwargs.get('process_timeout_sec', 300)
+ self.batch_worker = kwargs.get("batch_worker", False)
+ self.process_timeout_sec = kwargs.get("process_timeout_sec", 300)
- def run(self):
+ def run(self) -> Counter:
while True:
# TODO: this is batch-oriented, because underlying worker is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
            # case where there is one update and thousands of creates;
# update would be lingering in worker, and if worker crashed
# never created. Not great.
batch = self.consumer.consume(
- num_messages=self.batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval)".format(
- len(batch), self.poll_interval),
- file=sys.stderr)
+ num_messages=self.batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval)".format(
+ len(batch), self.poll_interval
+ ),
+ file=sys.stderr,
+ )
if not batch:
# TODO: could have some larger timeout here and
# self.worker.finish() if it's been more than, eg, a couple
@@ -515,14 +550,14 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
if self.push_batches:
- self.counts['total'] += len(batch)
- records = [json.loads(msg.value().decode('utf-8')) for msg in batch]
+ self.counts["total"] += len(batch)
+ records = [json.loads(msg.value().decode("utf-8")) for msg in batch]
self.worker.push_batch(records)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
else:
for msg in batch:
- self.counts['total'] += 1
+ self.counts["total"] += 1
if self.raw_records:
# In this mode, pass the Kafka message as bytes through
# without decoding as JSON. Eg, for thumbnails (where
@@ -530,7 +565,7 @@ class KafkaJsonPusher(RecordPusher):
# from the message)
record = msg.value()
else:
- record = json.loads(msg.value().decode('utf-8'))
+ record = json.loads(msg.value().decode("utf-8"))
# This complex bit of code implements backoff/backpressure
# in a way that will not cause this Kafka consumer to lose
# partition assignments (resulting in a rebalance). This
@@ -540,7 +575,9 @@ class KafkaJsonPusher(RecordPusher):
while not done:
try:
# use timeouts; don't want kafka itself to timeout
- self.worker.push_record_timeout(record, key=msg.key(), timeout=self.process_timeout_sec)
+ self.worker.push_record_timeout(
+ record, key=msg.key(), timeout=self.process_timeout_sec
+ )
break
except SandcrawlerBackoffError as be:
print("Backing off for 200 seconds: {}".format(be))
@@ -552,8 +589,8 @@ class KafkaJsonPusher(RecordPusher):
assert not empty_batch
time.sleep(5)
self.consumer.resume(self.consumer.assignment())
- self.counts['pushed'] += 1
- if self.counts['total'] % 500 == 0:
+ self.counts["pushed"] += 1
+ if self.counts["total"] % 500 == 0:
print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
for msg in batch:
# locally store offsets of processed messages; will be
@@ -562,16 +599,16 @@ class KafkaJsonPusher(RecordPusher):
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
- worker_counts = self.worker.finish()
+ self.worker.finish()
print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
self.consumer.close()
return self.counts
-def make_kafka_consumer(hosts, consume_topic, group):
+def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer:
topic_name = consume_topic
- def fail_fast(err, partitions):
+ def fail_fast(err: Any, partitions: List[Any]) -> None:
if err is not None:
print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
print("Bailing out...", file=sys.stderr)
@@ -584,40 +621,41 @@ def make_kafka_consumer(hosts, consume_topic, group):
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
# messages don't have offset marked as stored until processed,
# but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 6min)
- 'max.poll.interval.ms': 360000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ "max.poll.interval.ms": 360000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
- def on_rebalance(consumer, partitions):
+ def on_rebalance(consumer: Any, partitions: List[Any]) -> None:
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions),
- file=sys.stderr)
+ print(
+ "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr
+ )
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
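Because enable.auto.offset.store is disabled while enable.auto.commit stays on, callers such as KafkaJsonPusher above mark each message's offset as stored only after processing it. A sketch of that pattern, with hypothetical broker, topic, and group names:

    import json

    consumer = make_kafka_consumer(
        "localhost:9092", "sandcrawler-dev.ingest-file-results", "example-group"
    )
    while True:
        batch = consumer.consume(num_messages=100, timeout=5.0)
        for msg in batch:
            record = json.loads(msg.value().decode("utf-8"))
            ...  # process the record (placeholder)
            # mark the offset as stored; the broker commit itself happens on
            # librdkafka's auto-commit timer (enable.auto.commit stays True)
            consumer.store_offsets(message=msg)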
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
index 7a0086d..83d53d4 100644
--- a/python/sandcrawler/xml.py
+++ b/python/sandcrawler/xml.py
@@ -1,4 +1,3 @@
-
import xml.etree.ElementTree as ET
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 6be8bac..aebcbe1 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -1,26 +1,23 @@
#!/usr/bin/env python3
-
"""
These are generally for continuously running workers that consume from Kafka.
Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
or S3 (SeaweedFS).
"""
+import argparse
import os
+import subprocess
import sys
-import argparse
-import datetime
-import raven
-from sandcrawler import *
-from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
+import sentry_sdk
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-try:
- git_sha = raven.fetch_git_sha('..')
-except Exception as e:
- git_sha = None
-sentry_client = raven.Client(release=git_sha)
+from sandcrawler import *
+from sandcrawler.persist import (
+ PersistCrossrefWorker,
+ PersistHtmlTeiXmlWorker,
+ PersistXmlDocWorker,
+)
def run_grobid_extract(args):
@@ -50,6 +47,7 @@ def run_grobid_extract(args):
)
pusher.run()
+
def run_pdf_extract(args):
consume_topic = "sandcrawler-{}.unextracted".format(args.env)
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
@@ -80,6 +78,7 @@ def run_pdf_extract(args):
)
pusher.run()
+
def run_persist_grobid(args):
consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
worker = PersistGrobidWorker(
@@ -94,6 +93,8 @@ def run_persist_grobid(args):
kafka_group = "persist-grobid"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -104,6 +105,7 @@ def run_persist_grobid(args):
)
pusher.run()
+
def run_persist_pdftext(args):
consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
worker = PersistPdfTextWorker(
@@ -118,6 +120,8 @@ def run_persist_pdftext(args):
kafka_group = "persist-pdf-text"
if args.s3_only:
kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
@@ -128,6 +132,7 @@ def run_persist_pdftext(args):
)
pusher.run()
+
def run_persist_thumbnail(args):
consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
worker = PersistThumbnailWorker(
@@ -138,17 +143,21 @@ def run_persist_thumbnail(args):
s3_extension=".180px.jpg",
s3_folder="pdf",
)
+ kafka_group = "persist-pdf-thumbnail"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-pdf-thumbnail",
+ group=kafka_group,
push_batches=False,
raw_records=True,
batch_size=25,
)
pusher.run()
+
def run_persist_xml_doc(args: argparse.Namespace) -> None:
consume_topic = f"sandcrawler-{args.env}.xml-doc"
worker = PersistXmlDocWorker(
@@ -157,16 +166,20 @@ def run_persist_xml_doc(args: argparse.Namespace) -> None:
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
)
+ kafka_group = "persist-xml-doc"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-xml-doc",
+ group=kafka_group,
push_batches=False,
batch_size=25,
)
pusher.run()
+
def run_persist_html_teixml(args: argparse.Namespace) -> None:
consume_topic = f"sandcrawler-{args.env}.html-teixml"
worker = PersistHtmlTeiXmlWorker(
@@ -175,16 +188,20 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None:
s3_access_key=args.s3_access_key,
s3_secret_key=args.s3_secret_key,
)
+ kafka_group = "persist-html-teixml"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-html-teixml",
+ group=kafka_group,
push_batches=False,
batch_size=25,
)
pusher.run()
+
def run_persist_pdftrio(args):
consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
worker = PersistPdfTrioWorker(
@@ -200,13 +217,20 @@ def run_persist_pdftrio(args):
)
pusher.run()
+
def run_ingest_file(args):
+ spn_cdx_retry_sec = 9.0
if args.bulk:
consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
consume_topic = "sandcrawler-{}.ingest-file-requests-bulk".format(args.env)
+ elif args.priority:
+ spn_cdx_retry_sec = 45.0
+ consume_group = "sandcrawler-{}-ingest-file-priority".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-priority".format(args.env)
else:
+ spn_cdx_retry_sec = 1.0
consume_group = "sandcrawler-{}-ingest-file".format(args.env)
- consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
@@ -248,8 +272,9 @@ def run_ingest_file(args):
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
htmlteixml_sink=htmlteixml_sink,
- # don't SPNv2 for --bulk backfill
- try_spn2=not args.bulk,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
+ spn_cdx_retry_sec=spn_cdx_retry_sec,
)
pusher = KafkaJsonPusher(
worker=worker,
@@ -260,6 +285,7 @@ def run_ingest_file(args):
)
pusher.run()
+
def run_persist_ingest_file(args):
consume_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
worker = PersistIngestFileResultWorker(
@@ -275,96 +301,195 @@ def run_persist_ingest_file(args):
)
pusher.run()
+
+def run_persist_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ consume_topic = "fatcat-{}.api-crossref".format(args.env)
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-crossref",
+ push_batches=True,
+ # small batch size because doing GROBID processing
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
- parser.add_argument('--db-url',
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--kafka-group-suffix", default="", help="Kafka consumer group suffix (optional)"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ parser.add_argument(
+ "--db-url",
help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (seaweedfs) backend URL",
- default="localhost:9000")
- parser.add_argument('--s3-access-key',
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
- parser.add_argument('--s3-secret-key',
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))
- parser.add_argument('--s3-bucket',
- help="S3 (seaweedfs) bucket to persist into",
- default="sandcrawler-dev")
+ default=os.environ.get("SANDCRAWLER_BLOB_SECRET_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
subparsers = parser.add_subparsers()
- sub_grobid_extract = subparsers.add_parser('grobid-extract',
- help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka")
+ sub_grobid_extract = subparsers.add_parser(
+ "grobid-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka",
+ )
sub_grobid_extract.set_defaults(func=run_grobid_extract)
- sub_pdf_extract = subparsers.add_parser('pdf-extract',
- help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka")
+ sub_pdf_extract = subparsers.add_parser(
+ "pdf-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka",
+ )
sub_pdf_extract.set_defaults(func=run_pdf_extract)
- sub_persist_grobid = subparsers.add_parser('persist-grobid',
- help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres")
- sub_persist_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_grobid.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_grobid = subparsers.add_parser(
+ "persist-grobid",
+ help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_grobid.set_defaults(func=run_persist_grobid)
- sub_persist_pdftext = subparsers.add_parser('persist-pdftext',
- help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")
- sub_persist_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_persist_pdftext.add_argument('--db-only',
- action='store_true',
- help="only write status to database (don't upload TEI-XML to S3)")
+ sub_persist_pdftext = subparsers.add_parser(
+ "persist-pdftext",
+ help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
- sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail',
- help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")
+ sub_persist_thumbnail = subparsers.add_parser(
+ "persist-thumbnail",
+ help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
- sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc',
- help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_xml_doc = subparsers.add_parser(
+ "persist-xml-doc",
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
- sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml',
- help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_html_teixml = subparsers.add_parser(
+ "persist-html-teixml",
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
- sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
- help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+ sub_persist_pdftrio = subparsers.add_parser(
+ "persist-pdftrio",
+ help="daemon that consumes pdftrio output from Kafka and pushes to postgres",
+ )
sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
- sub_ingest_file = subparsers.add_parser('ingest-file',
- help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
- sub_ingest_file.add_argument('--bulk',
- action='store_true',
- help="consume from bulk kafka topic (eg, for ingest backfill)")
+ sub_ingest_file = subparsers.add_parser(
+ "ingest-file",
+ help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka",
+ )
+ sub_ingest_file.add_argument(
+ "--bulk",
+ action="store_true",
+ help="consume from bulk kafka topic (eg, for ingest backfill)",
+ )
+ sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
+ "--priority",
+ action="store_true",
+ help="consume from priority kafka topic (eg, for SPN requests)",
+ )
sub_ingest_file.set_defaults(func=run_ingest_file)
- sub_persist_ingest_file = subparsers.add_parser('persist-ingest-file',
- help="daemon that consumes ingest-file output from Kafka and pushes to postgres")
+ sub_persist_ingest_file = subparsers.add_parser(
+ "persist-ingest-file",
+ help="daemon that consumes ingest-file output from Kafka and pushes to postgres",
+ )
sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
+ sub_persist_crossref = subparsers.add_parser(
+ "persist-crossref",
+ help="daemon that persists crossref to postgres; also does GROBID ref transform",
+ )
+ sub_persist_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_persist_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+ sub_persist_crossref.set_defaults(func=run_persist_crossref)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
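
The ingest-file changes above split the single requests topic into daily, priority, and bulk variants, each with a different SPNv2 CDX retry delay. A minimal sketch of the resulting topic/group selection, assuming the naming pattern visible in the diff (illustrative only, not the worker code itself):

    def resolve_ingest_file_kafka(env: str, bulk: bool = False, priority: bool = False) -> tuple:
        # Mirrors the branch added in run_ingest_file() above.
        if bulk:
            return (
                f"sandcrawler-{env}-ingest-file-bulk",
                f"sandcrawler-{env}.ingest-file-requests-bulk",
                9.0,
            )
        if priority:
            return (
                f"sandcrawler-{env}-ingest-file-priority",
                f"sandcrawler-{env}.ingest-file-requests-priority",
                45.0,
            )
        return (
            f"sandcrawler-{env}-ingest-file",
            f"sandcrawler-{env}.ingest-file-requests-daily",
            1.0,
        )

    # e.g. resolve_ingest_file_kafka("prod", priority=True) returns
    # ("sandcrawler-prod-ingest-file-priority",
    #  "sandcrawler-prod.ingest-file-requests-priority", 45.0)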
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..4561541 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
@@ -12,9 +11,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
@@ -22,51 +21,54 @@ def run(args):
if not l.strip():
continue
row = json.loads(l)
- if not row['hit']:
+ if not row["hit"]:
continue
request = {
- 'base_url': row['final_url'],
- 'ingest_type': args.ingest_type,
- 'link_source': args.link_source,
- 'link_source_id': row['identifier'],
- 'ingest_request_source': args.ingest_request_source,
- 'ext_ids': {
- args.extid_type: row['identifier'],
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
- request['release_stage'] = args.release_stage
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type',
- required=True,
- help="extid to encode identifier as")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
- parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage',
- default=None,
- help="to include in request")
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
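
For reference, the transform above maps each arabesque "hit" row onto a single ingest request. A sketch of the output shape for a hypothetical row (URL, identifier, and flag values below are made up), as if invoked with --link-source doi --extid-type doi --release-stage published:

    row = {"hit": True, "final_url": "https://example.com/paper.pdf", "identifier": "10.123/abc"}
    # would be printed as (keys sorted by json.dumps):
    request = {
        "base_url": "https://example.com/paper.pdf",
        "ext_ids": {"doi": "10.123/abc"},
        "ingest_request_source": "arabesque",
        "ingest_type": "pdf",
        "link_source": "doi",
        "link_source_id": "10.123/abc",
        "release_stage": "published",
    }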
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..6328f52
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Helper script to generate fatcat fileset manifests from archive.org items.
+
+Takes either two args (release ident and archive.org item name), or a stream
+of such tab-separated pairs on stdin.
+
+TODO:
+- should this check the item type?
+"""
+
+import json
+import sys
+from typing import Any
+
+import internetarchive
+
+FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+}
+
+
+def want_file(f: dict, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+
+def parse_file(f: dict) -> dict:
+ """
+ Takes an IA API file and turns it in to a fatcat fileset manifest file
+ """
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = {
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
+ }
+ # TODO: will disable this hard check eventually and replace with:
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ mimetype = FORMAT_TO_MIMETYPE[f.format]
+ if mimetype:
+ mf["extra"] = dict(mimetype=mimetype)
+ return mf
+
+
+def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
+ print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+ if release_id.startswith("release_"):
+        release_id = release_id[8:]
+ assert len(release_id) == 26
+ item = session.get_item(item_name)
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+ fileset = {
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
+ }
+ print(json.dumps(fileset))
+ return fileset
+
+
+def main():
+ session = internetarchive.get_session()
+ if len(sys.argv) == 3:
+ item_name = sys.argv[1]
+ release_id = sys.argv[2]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+ else:
+ for line in sys.stdin:
+ line = line.strip()
+ if not line:
+ continue
+ fields = line.split("\t")
+ assert len(fields) == 2
+ item_name = fields[0]
+ release_id = fields[1]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+
+
+if __name__ == "__main__":
+ main()
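
The script above prints one fileset JSON object per item. An approximate example of that output, with made-up item name, hashes, and release ident:

    fileset = {
        "manifest": [
            {
                "path": "data.csv",
                "size": 1234,
                "sha1": "<40-char hex sha1>",
                "md5": "<32-char hex md5>",
                "extra": {"mimetype": "text/csv"},
            },
        ],
        "urls": [
            {"rel": "archive", "url": "https://archive.org/download/example-item/"},
        ],
        "release_ids": ["aaaaaaaaaaaaaaaaaaaaaaaaaa"],
    }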
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..0b60da3
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection SOME_COLLECTION_NAME
+"""
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import internetarchive as ia
+import requests
+
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item["identifier"]
+ # TODO: error handling
+ try:
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
+
+ # Move and cleanup
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..e3bf4f0 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
-
"""
 Transform a COVID-19 metadata dump (CNKI or Wanfang JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
@@ -18,38 +18,44 @@ def canon(s):
def transform_cnki(obj):
requests = []
- assert obj['cnki_id']
-
+ assert obj["cnki_id"]
requests = []
- requests.append({
- 'base_url': canon(obj['info_url']),
- 'ingest_type': 'pdf',
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
- if 'read_url' in obj:
- requests.append({
- 'base_url': canon(obj['read_url']),
- 'ingest_type': 'pdf', # actually HTML
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
return requests
+
def transform_wanfang(obj):
- assert obj['wanfang_id']
- return [{
- 'base_url': canon(obj['url']),
- 'ingest_type': 'pdf',
- 'link_source': 'wanfang_covid19',
- 'link_source_id': obj['wanfang_id'],
- 'ingest_request_source': 'scrape-covid19',
- }]
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
def run(args):
@@ -58,26 +64,27 @@ def run(args):
continue
row = json.loads(l)
- if 'wanfang_id' in row:
+ if "wanfang_id" in row:
requests = transform_wanfang(row) or []
- elif 'cnki_id' in row:
+ elif "cnki_id" in row:
requests = transform_cnki(row) or []
else:
continue
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,23 +19,20 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -45,81 +42,80 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
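
TEI-XML objects are sharded in the bucket by the first four hex characters of the SHA-1. A worked example of the key layout (the sha1 value is made up; prefix and suffix are the defaults above):

    sha1_hex = "9c1ecc0c89add05a631ac5ac5b60f796e6787882"
    key = "{}{}/{}{}".format("grobid/", sha1_hex[0:4], sha1_hex, ".tei.xml")
    # -> "grobid/9c1e/9c1ecc0c89add05a631ac5ac5b60f796e6787882.tei.xml"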
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..093f32a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
- with open(fpath, 'wb') as f:
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
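
On disk, files are sharded two levels deep by SHA-1 prefix, which is why run() pre-creates all 65536 base directories. A worked example of the path layout (directory and sha1 values are made up; prefix and suffix are the defaults above):

    disk_dir, disk_prefix, disk_suffix = "/srv/pdfs", "pdf/", ".pdf"
    sha1_hex = "9c1ecc0c89add05a631ac5ac5b60f796e6787882"
    fpath = "{}/{}{}/{}/{}{}".format(
        disk_dir, disk_prefix, sha1_hex[0:2], sha1_hex[2:4], sha1_hex, disk_suffix
    )
    # -> "/srv/pdfs/pdf/9c/1e/9c1ecc0c89add05a631ac5ac5b60f796e6787882.pdf"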
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..6f37ede 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -33,152 +33,180 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index b981ab6..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
 Transform a DOAJ article dump (JSON) into ingest requests.
@@ -9,31 +8,31 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- #"semanticscholar.org/",
+ # "semanticscholar.org/",
"://doi.org/",
+ "://dx.doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
".archive.org/",
-
# large publishers/platforms; may remove in the future
- #"://link.springer.com/",
- #"://dergipark.gov.tr/",
- #"frontiersin.org/",
- #"scielo",
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
]
# these default to PDF; note that we also do pdf ingests for HTML pages
@@ -41,78 +40,83 @@ CONTENT_TYPE_MAP = {
"abstract": [],
"doc": [],
"": ["pdf"],
-
"doi": ["pdf"],
"url": ["pdf"],
"fulltext": ["pdf"],
"anySimpleType": ["pdf"],
-
"application/pdf": ["pdf"],
"html": ["html", "pdf"],
"text/html": ["html", "pdf"],
"xml": ["xml"],
}
+
def canon(s: str) -> str:
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj: dict) -> List[dict]:
"""
Transforms from a single DOAJ object to zero or more ingest requests.
Returns a list of dicts.
"""
- doaj_id = obj['id'].lower()
+ doaj_id = obj["id"].lower()
assert doaj_id
- bibjson = obj['bibjson']
- if not bibjson['link']:
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
return []
requests = []
doi: Optional[str] = None
- for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
- doi = ident['id'].lower()
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
- for link in (bibjson['link'] or []):
- if link.get('type') != "fulltext" or not link.get('url'):
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
continue
- ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
if not ingest_types:
continue
+
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in link['url'].lower():
+ if domain in link["url"].lower():
skip = True
if skip:
continue
try:
- base_url = canon(link['url'])
+ base_url = canon(link["url"].strip())
except UnicodeEncodeError:
continue
+ if not base_url or len(base_url) > 1000:
+ continue
+
for ingest_type in ingest_types:
request = {
- 'base_url': base_url,
- 'ingest_type': ingest_type,
- 'link_source': 'doaj',
- 'link_source_id': doaj_id,
- 'ingest_request_source': 'doaj',
- 'release_stage': 'published',
- 'rel': 'publisher',
- 'ext_ids': {
- 'doi': doi,
- 'doaj': doaj_id,
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
return requests
+
def run(args) -> None:
for l in args.json_file:
if not l.strip():
@@ -123,17 +127,18 @@ def run(args) -> None:
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main() -> None:
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
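
Because of CONTENT_TYPE_MAP, a single DOAJ fulltext link can expand into more than one request. A small sketch of that expansion for a text/html link (DOAJ id and URL are made up; other request fields omitted):

    # "text/html" maps to both "html" and "pdf" ingest types,
    # so transform() emits two requests for this one link
    link = {"type": "fulltext", "url": "https://example.org/article/view/1", "content_type": "text/html"}
    for ingest_type in ["html", "pdf"]:  # CONTENT_TYPE_MAP["text/html"]
        print({"base_url": link["url"], "ingest_type": ingest_type, "link_source": "doaj"})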
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -17,29 +17,32 @@ And outputs JSON objects that are can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
obj = dict(
sha1=sha1,
dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
size=size,
- mimetype=mimetype)
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
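
The expected stdin format is five tab-separated columns. A made-up example line, matching the parsing above (the base32 hash and DOI are illustrative values only):

    line = "\t".join([
        "sha1:TEHMYDEJVXQFUYYWDLC3MD3ZN5THQ6EC",  # base32 SHA-1
        '["10.123/abc"]',                         # JSON list of matched DOIs
        '{"url": "https://example.com/a.pdf", "dt": "20170102030405"}',  # CDX info
        "application/pdf",                        # mimetype
        "1234",                                   # size in bytes
    ])
    # the script converts the sha1 to hex and prints a JSON object with
    # sha1, dois, cdx, size, and mimetype fields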
diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py
new file mode 100755
index 0000000..2eb56cb
--- /dev/null
+++ b/python/scripts/fetch_cdx_sha1hex.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+"""
+This is a helper script to take fatcat file entities with partial metadata (eg,
+missing SHA256) and try to find one or more CDX records where the file may be
+found in wayback.
+
+This script uses the sandcrawler library and should be run like:
+
+ head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json
+"""
+
+import base64
+import json
+import sys
+from typing import List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+from sandcrawler.ia import CdxApiClient, cdx_to_dict
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+
+def b32_hex(s: str) -> str:
+ """
+ Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+ base32 checksums are used by, eg, heritrix and in wayback CDX files
+ """
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ if len(s) == 40:
+ return s
+ raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+
+def get_db_cdx(sha1hex: str, http_session) -> List[dict]:
+ resp = http_session.get(
+ SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex)
+ )
+ resp.raise_for_status()
+ rows = resp.json()
+ return rows or []
+
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+
+def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]:
+
+ params = {
+ "url": url,
+ "output": "json",
+ "matchType": "exact",
+ "limit": 20,
+ # TODO: group-by digest/checksum?
+ # can't filter status because might be warc/revisit
+ # "filter": "statuscode:200",
+ }
+ rows = cdx_api._query_api(params)
+ if not rows:
+ return None
+ for row in rows:
+ if row.sha1hex == sha1hex:
+ return row
+ return None
+
+
+def process_file(fe, session, cdx_api) -> dict:
+ status = "unknown"
+
+ # simple CDX db lookup first
+ cdx_row_list = get_db_cdx(fe["sha1"], http_session=session)
+ if cdx_row_list:
+ return dict(
+ file_entity=fe,
+ cdx_rows=cdx_row_list,
+ status="success-db",
+ )
+
+ original_urls = []
+ for pair in fe["urls"]:
+ u = pair["url"]
+        if "://web.archive.org/web/" not in u:
+ continue
+ seg = u.split("/")
+ assert seg[2] == "web.archive.org"
+ assert seg[3] == "web"
+ if not seg[4].isdigit():
+ continue
+ original_url = "/".join(seg[5:])
+ original_urls.append(original_url)
+
+ if len(original_urls) == 0:
+ return dict(file_entity=fe, status="skip-no-urls")
+
+ found_cdx_rows = []
+ for url in list(set(original_urls)):
+
+ cdx_record = None
+ try:
+            cdx_record = get_api_cdx(url, sha1hex=fe["sha1"], cdx_api=cdx_api)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, status="fail-cdx-403")
+ else:
+ raise
+ if cdx_record and cdx_record.sha1hex == fe["sha1"]:
+ found_cdx_rows.append(cdx_to_dict(cdx_record))
+
+ if found_cdx_rows:
+ return dict(
+ file_entity=fe,
+ cdx_rows=found_cdx_rows,
+ status="success-api",
+ )
+
+ return dict(
+ file_entity=fe,
+ status="fail-not-found",
+ )
+
+
+def main():
+ session = requests_retry_session()
+ session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+ }
+ )
+ cdx_api = CdxApiClient()
+ for line in sys.stdin:
+ if not line.strip():
+ continue
+ fe = json.loads(line)
+ print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api)))
+
+
+if __name__ == "__main__":
+ main()
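
The URL handling above recovers the original URL from a wayback replay URL by splitting on "/" and re-joining everything after the timestamp segment. A worked example (the URL is made up):

    u = "https://web.archive.org/web/20200102030405/https://example.com/paper.pdf"
    seg = u.split("/")
    # seg[2] == "web.archive.org", seg[3] == "web", seg[4] == "20200102030405"
    original_url = "/".join(seg[5:])
    # -> "https://example.com/paper.pdf" (the empty element after "https:" restores the "//")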
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,44 +1,49 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
-with open('title_slug_denylist.txt', 'r') as f:
+with open("title_slug_denylist.txt", "r") as f:
TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_DENYLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
NAME_DENYLIST = (
- 'phd',
- 'phdstudent',
+ "phd",
+ "phdstudent",
)
+
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+    s = s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
title = title.strip()
@@ -47,14 +52,14 @@ def filter_title(title):
title_slug = tokenize(title, remove_whitespace=True)
if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+            title = title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -78,63 +83,84 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
+
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
# same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
+
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
+
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -151,9 +177,10 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
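
A minimal driving sketch for the metadata filter above. The script reads TSV rows on stdin (either a single JSON column, or five columns with the GROBID JSON in the fifth), writes kept rows to stdout, and with --invert prints the rejected raw JSON instead. Paths and sample values below are assumptions for illustration only:

    import json
    import subprocess

    row = "\t".join([
        "sha1:XXXX", "-", "-", "-",
        json.dumps({
            "title": "A Study of Examples in Metadata Filtering",
            "authors": [{"name": "Jane Doe"}],
            "journal": {"name": "Journal of Examples"},
            "citations": [],
        }),
    ])
    proc = subprocess.run(
        ["python3", "scripts/filter_grobid_metadata.py"],
        input=row + "\n",
        capture_output=True,
        text=True,
    )
    # Prints the cleaned five-column row, or nothing if the title/authors were rejected.
    print(proc.stdout, end="")
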
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
@@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -51,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -59,20 +61,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -86,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -101,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -126,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
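
The run() loop above depends on stdin being pre-sorted by slug (the first TSV column), so that each group can be merged in a single streaming pass. A standalone restatement of that batching pattern, with process_group reduced to a stand-in:

    import sys

    def process_group(rows):
        # stand-in for the real merge logic shown in the diff above
        print("group of", len(rows), "rows for slug", rows[0][0])

    last_slug = None
    lines = []
    for line in sys.stdin:
        fields = line.strip().split("\t")
        slug = fields[0]
        if last_slug is not None and slug != last_slug and lines:
            process_group(lines)
            lines = []
        last_slug = slug
        lines.append(fields)
    if lines:
        process_group(lines)
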
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
@@ -23,15 +23,16 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -44,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -52,20 +53,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
@@ -78,20 +81,21 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -99,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
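
The author check in this script (and in filter_groupworks.py above) only requires the tokenized last name of each GROBID author to appear somewhere in the tokenized Crossref author string. A simplified standalone illustration of that comparison; tokenize here is a cut-down restatement, not an import of the script:

    def tokenize(s):
        # keep alphanumerics and whitespace, lowercase, then force ASCII bytes
        s = "".join(c for c in s.lower() if c.isalnum() or c.isspace())
        return s.encode("ascii", "replace").replace(b"?", b"")

    right_all = tokenize(" ".join(["One Two", "Mr. Magoo"]))
    for left in ["J. Two", "Magoo"]:
        last = left.lower().replace("jr.", "").split()[-1]
        print(left, "->", tokenize(last) in right_all)  # True, True
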
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..90a0f77 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -10,43 +9,49 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
+
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
- obj = teixml2json(tei_xml, encumbered=False)
+ tei_doc = parse_document_xml(tei_xml)
+ tei_doc.remove_encumbered()
+ obj = tei_doc.to_legacy_dict()
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
+
-if __name__=='__main__':
+if __name__ == "__main__":
run()
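
For reference, the new grobid_tei_xml call pattern introduced in this hunk, shown in isolation. The input path is a placeholder; any GROBID TEI document is expected:

    from grobid_tei_xml import parse_document_xml

    with open("document.tei.xml") as f:  # placeholder path
        tei_xml = f.read()

    tei_doc = parse_document_xml(tei_xml)
    tei_doc.remove_encumbered()  # strip copyright-encumbered text (abstract/body)
    legacy = tei_doc.to_legacy_dict()
    affiliations = [a["affiliation"] for a in legacy["authors"] if a.get("affiliation")]
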
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 3d2e14c..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,69 +1,67 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
+
+MAX_ABSTRACT_BYTES = 4096
-MAX_ABSTRACT_BYTES=4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -73,15 +71,17 @@ def parse_grobid_json(obj):
extra = None
return dict(
- title=obj['title'].strip(),
+ title=obj["title"].strip(),
contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
abstracts=abstracts,
release_type=release_type,
release_date=release_date,
- extra=extra)
+ extra=extra,
+ )
+
def run():
for line in sys.stdin:
@@ -90,5 +90,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..8a353ca 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back in to regular ingest request JSON.
@@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
"""
dict-to-dict
"""
- row.pop('created', None)
- extra = row.pop('request', None) or {}
- for k in ('ext_ids', 'edit_extra'):
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
if k in extra:
row[k] = extra[k]
- if 'release_ident' in extra:
- row['fatcat'] = dict(release_ident=extra['release_ident'])
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
return row
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -33,19 +33,27 @@ def run(args):
req = transform(json.loads(l))
except:
print(l, file=sys.stderr)
+ if args.force_recrawl:
+ req["force_recrawl"] = True
print(json.dumps(req, sort_keys=True))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="whether to add recrawl (SPNv2) flag to request",
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
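
A worked example of transform() on a single exported row (all values invented): "created" is dropped, "ext_ids" is lifted out of the nested "request" column, and "release_ident" becomes a "fatcat" sub-object.

    row = {
        "base_url": "https://example.com/paper.pdf",
        "created": "2021-10-01T00:00:00Z",
        "request": {
            "ext_ids": {"doi": "10.1000/example.123"},
            "release_ident": "aaaaaaaaaaaaaaaaaaaaaaaaaa",
        },
    }
    # transform(row) ->
    # {
    #     "base_url": "https://example.com/paper.pdf",
    #     "ext_ids": {"doi": "10.1000/example.123"},
    #     "fatcat": {"release_ident": "aaaaaaaaaaaaaaaaaaaaaaaaaa"},
    # }
    # With --force-recrawl, run() additionally sets req["force_recrawl"] = True.
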
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
@@ -20,6 +20,7 @@ import sqlite3
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:%",
+]
+
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
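
A worked example (record values invented) of the request that transform() emits for an OAI-PMH record that clears both OAI_BLOCKLIST and DOMAIN_BLOCKLIST; note that the DOI is now added to ext_ids only when present:

    record = {
        "oai": "oai:example.edu:1234",
        "urls": ["https://repository.example.edu/files/1234.pdf"],
        "doi": ["10.1000/example.123"],
        "types": ["info:eu-repo/semantics/publishedVersion"],
    }
    # transform(record) ->
    # [{
    #     "base_url": "https://repository.example.edu/files/1234.pdf",  # urlcanon-normalized
    #     "ingest_type": "pdf",
    #     "link_source": "oai",
    #     "link_source_id": "oai:example.edu:1234",
    #     "ingest_request_source": "metha-bulk",
    #     "release_stage": "published",
    #     "rel": None,
    #     "ext_ids": {"oai": "oai:example.edu:1234", "doi": "10.1000/example.123"},
    #     "edit_extra": {},
    # }]
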
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index af08db6..8b57c5b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
@@ -7,6 +6,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
@@ -22,13 +22,16 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
- img.thumbnail((180,300), Image.BICUBIC)
- #img.thumbnail((360,600), Image.BICUBIC)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
+ img.thumbnail((180, 300), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
- #img.save(outpath, quality=95)
+ # img.save(outpath, quality=95)
+
-if __name__ == '__main__':
+if __name__ == "__main__":
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
sys.exit(-1)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,41 +1,39 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
- "://archive.org/",
- ".archive.org/",
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single unpaywall object to zero or more ingest requests.
@@ -43,48 +41,49 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -95,17 +94,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
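
And the equivalent worked example for an unpaywall record (values invented); the DOI is lowercased, and oa_status/evidence/pmh_id ride along when present:

    record = {
        "doi": "10.1000/EXAMPLE.123",
        "oa_status": "green",
        "oa_locations": [{
            "url_for_pdf": "https://repository.example.edu/files/1234.pdf",
            "version": "publishedVersion",
            "host_type": "repository",
            "evidence": "oa repository (via OAI-PMH doi match)",
            "pmh_id": "oai:example.edu:1234",
        }],
    }
    # transform(record) -> one request with:
    #   "link_source": "unpaywall", "link_source_id": "10.1000/example.123",
    #   "release_stage": "published", "rel": "repository",
    #   "ext_ids": {"doi": "10.1000/example.123", "pmh_id": "oai:example.edu:1234"},
    #   "edit_extra": {"oa_status": "green", "evidence": "oa repository (via OAI-PMH doi match)"}
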
diff --git a/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
new file mode 100644
index 0000000..54d07db
--- /dev/null
+++ b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T22:08:45Z","timestamp":1620684525878},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-64953-1_4","type":"book-chapter","created":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T02:57:20Z","timestamp":1610593040000},"page":"53-71","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Knowledge and Mathematical Objects"],"prefix":"10.1007","author":[{"given":"Lars-G\u00f6ran","family":"Johansson","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,14]]},"reference":[{"key":"4_CR12","doi-asserted-by":"publisher","volume-title":"Deflating existential consequence: A case for nominalism","author":"J Azzouni","year":"2004","unstructured":"Azzouni, J. (2004). Deflating existential consequence: A case for nominalism. New York: Oxford University Press.","DOI":"10.1093\/0195159888.001.0001"},{"key":"4_CR23","doi-asserted-by":"publisher","volume-title":"Foundations of constructive mathematics","author":"M Beeson","year":"1985","unstructured":"Beeson, M. (1985). Foundations of constructive mathematics. Berlin\/Heidelberg: Springer.","DOI":"10.1007\/978-3-642-68952-9"},{"issue":"2","key":"4_CR27","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1093\/philmat\/11.2.176","volume":"11","author":"H Billinge","year":"2003","unstructured":"Billinge, H. (2003). Did bishop have a philosophy of mathematics? Philosophica Mathematica, 11(2), 176\u2013194.","journal-title":"Philosophica Mathematica"},{"key":"4_CR29","doi-asserted-by":"publisher","volume-title":"Constructive analysis","author":"E Bishop","year":"1985","unstructured":"Bishop, E., & Bridges, D. S. (1985). Constructive analysis. Berlin: Springer.","DOI":"10.1007\/978-3-642-61667-9"},{"key":"4_CR37","series-title":"In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.)","volume-title":"Nominalism in the philosophy of mathematics","author":"O Bueno","year":"2014","unstructured":"Bueno, O. (2014). Nominalism in the philosophy of mathematics. In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.). Metaphysics Research Lab, Stanford University."},{"key":"4_CR38","volume-title":"Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen","author":"G Cantor","year":"1883","unstructured":"Cantor, G. (1883). Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen. Leipzig: Teubner."},{"key":"4_CR60","volume-title":"The seas of language","author":"M Dummett","year":"1993","unstructured":"Dummett, M. (1993). The seas of language. 
Oxford: Clarendon Press."},{"key":"4_CR73","volume-title":"In the light of logic","author":"S Feferman","year":"1998","unstructured":"Feferman, S. (1998). In the light of logic. New York: Oxford University Press."},{"key":"4_CR74","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1093\/0195148770.003.0019","volume-title":"The Oxford handbook of philosophy of mathematics and logic","author":"S Feferman","year":"2005","unstructured":"Feferman, S. (2005). Predicativity. In S. Shapiro (Ed.), The Oxford handbook of philosophy of mathematics and logic (pp. 590\u2013624). New York\/Oxford: Oxford University Press."},{"key":"4_CR77","volume-title":"Science without numbers: A defence of nominalism","author":"H H Field","year":"1980","unstructured":"Field, H. H. (1980). Science without numbers: A defence of nominalism. Oxford: Blackwell."},{"key":"4_CR88","volume-title":"Werke, volume 8","author":"C F Gauss","year":"2011","unstructured":"Gauss, C. F. (2011). Werke, volume 8. Cambridge: Cambridge University Press."},{"key":"4_CR93","unstructured":"Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."},{"key":"4_CR103","volume-title":"Mathematics without numbers: Towards a modal-structural interpretation","author":"G Hellman","year":"1989","unstructured":"Hellman, G. (1989). Mathematics without numbers: Towards a modal-structural interpretation. Oxford: Clarendon Press."},{"key":"4_CR126","first-page":"201","volume-title":"Bertrand Russell. Philosopher of the century","author":"G Kreisel","year":"1967","unstructured":"Kreisel, G. (1967). Mathematical logic: What has it done for the philosophy of mathematics? In R. Shoenman (Ed.), Bertrand Russell. Philosopher of the century (pp. 201\u2013272). London: George Allen & Unwin."},{"key":"4_CR135","doi-asserted-by":"crossref","unstructured":"Lear, J. (1980). Aristotelian infinity. Proceedings of the Aristotelian Society, New Series, 80, 187\u2013210.","DOI":"10.1093\/aristotelian\/80.1.187"},{"key":"4_CR175","doi-asserted-by":"publisher","first-page":"63","DOI":"10.12775\/LLP.1998.004","volume":"6","author":"F Pataut","year":"1998","unstructured":"Pataut, F. (1998). Incompleteness, constructivism and truth. Logic and Logical Philosophy, 6, 63\u201376.","journal-title":"Logic and Logical Philosophy"},{"key":"4_CR180","first-page":"294","volume":"14","author":"H Poincar\u00e9","year":"1906","unstructured":"Poincar\u00e9, H. (1906). Les math\u00e9matiques et la logique. Revue de m\u00e9taphysique et de morale, 14, 294\u2013317.","journal-title":"Revue de m\u00e9taphysique et de morale"},{"key":"4_CR190","volume-title":"Word and object","author":"W V O Quine","year":"1960","unstructured":"Quine, W. V. O. (1960). Word and object. Cambridge, MA: MIT Press."},{"key":"4_CR193","unstructured":"Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133\u2013136). Cambridge, MA: Harvard University Press."},{"key":"4_CR197","first-page":"31","volume-title":"Theories and things","author":"W V O Quine","year":"1981","unstructured":"Quine, W. V. O. (1981c). What price bivalence? In Theories and things (pp. 31\u201337). Cambridge, MA: The Belknap Press of Harvard University Press."},{"issue":"1","key":"4_CR198","doi-asserted-by":"publisher","first-page":"5","DOI":"10.2307\/2026889","volume":"89","author":"WV O Quine","year":"1992","unstructured":"Quine, W.V. O. (1992). Structure and nature. 
The Journal of Philosophy, 89(1), 5\u20139.","journal-title":"The Journal of Philosophy"},{"key":"4_CR199","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1080\/014453401625669","volume":"25","author":"P Raatikainen","year":"2004","unstructured":"Raatikainen, P. (2004). Conceptions of truth in intuitionism. History and Philosophy of Logic, 25, 131\u2013145.","journal-title":"History and Philosophy of Logic"},{"key":"4_CR210","unstructured":"Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29\u201353."},{"key":"4_CR212","volume-title":"Introduction to mathematical philosophy","author":"B Russell","year":"1919","unstructured":"Russell, B. (1919). Introduction to mathematical philosophy. London: Routledge."},{"key":"4_CR222","doi-asserted-by":"crossref","unstructured":"Schwarz, J. T. (2006(1966)). The pernicious influence of mathematics on science. In R. Hersch (Ed.), 18 unconventional essays on the nature of mathematics (Chap. 13, pp. 231\u2013235). New York: Springer.","DOI":"10.1007\/0-387-29831-2_13"},{"key":"4_CR233","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/BF00247187","volume":"12","author":"G Sundholm","year":"1983","unstructured":"Sundholm, G. (1983). Constructions, proofs and the meaning of logical constants. Journal of Philosophical Logic, 12, 151\u2013172.","journal-title":"Journal of Philosophical Logic"},{"issue":"2","key":"4_CR235","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10701-007-9186-9","volume":"38","author":"M Tegmark","year":"2008","unstructured":"Tegmark, M. (2008). The mathematical universe. Foundations of Physics, 38(2), 101\u2013150.","journal-title":"Foundations of Physics"},{"key":"4_CR262","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/0010-0277(90)90003-3","volume":"36","author":"K Wynn","year":"1990","unstructured":"Wynn, K. (1990). Children\u2019s understanding of counting. Cognition, 36, 155\u2013193.","journal-title":"Cognition"}],"container-title":["Synthese Library","Empiricism and Philosophy of Physics"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-64953-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T03:00:39Z","timestamp":1610593239000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":28,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-030-64953-1_4","relation":{},"ISSN":["0166-6991","2542-8292"],"issn-type":[{"value":"0166-6991","type":"print"},{"value":"2542-8292","type":"electronic"}],"published":{"date-parts":[[2021]]},"assertion":[{"value":"14 January 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}} \ No newline at end of file
diff --git a/python/tests/files/crossref_api_work_s1047951103000064.json b/python/tests/files/crossref_api_work_s1047951103000064.json
new file mode 100644
index 0000000..dfb795d
--- /dev/null
+++ b/python/tests/files/crossref_api_work_s1047951103000064.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,6,10]],"date-time":"2021-06-10T05:35:02Z","timestamp":1623303302043},"reference-count":46,"publisher":"Cambridge University Press (CUP)","issue":"1","license":[{"start":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T00:00:00Z","timestamp":1113782400000},"content-version":"unspecified","delay-in-days":807,"URL":"https:\/\/www.cambridge.org\/core\/terms"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cardiol Young"],"published-print":{"date-parts":[[2003,2]]},"abstract":"<jats:p>We designed a multi-hospital prospective study of children less than 12 years to determine the comparative clinical profile, severity of carditis, and outcome on follow up of patients suffering an initial and recurrent episodes of acute rheumatic fever. The study extended over a period of 3 years, with diagnosis based on the Jones criteria. We included 161 children in the study, 57 having only one episode and 104 with recurrent episodes. Those seen in the first episode were differentiated from those with recurrent episodes on the basis of the history. The severity of carditis was graded by clinical and echocardiographic means. In those suffering their first episode, carditis was significantly less frequent (61.4%) compared to those having recurrent episodes (96.2%). Arthritis was more marked in the first episode (61.4%) compared to recurrent episodes (36.5%). Chorea was also significantly higher in the first episode (15.8%) compared to recurrent episodes (3.8%). Sub-cutaneous nodules were more-or-less the same in those suffering the first (7%) as opposed to recurrent episodes (5.8%), but Erythema marginatum was more marked during the first episode (3.5%), being rare in recurrent episodes at 0.9%. Fever was recorded in approximately the same numbers in first (45.6%) and recurrent episodes (48.1%). Arthralgia, in contrast, was less frequent in first (21.1%) compared to recurrent episodes (32.7%). A history of sore throat was significantly increased amongst those suffering the first episode (54.4%) compared to recurrent episodes (21.2%). When we compared the severity of carditis in the first versus recurrent episodes, at the start of study mild carditis was found in 29.8% versus 10.6%, moderate carditis in 26.3% versus 53.8%, and severe carditis in 5.3% versus 31.8% of cases, respectively. At the end of study, 30.3% of patients suffering their first episode were completely cured of carditis, and all others showed significant improvement compared to those with recurrent episodes, where only 6.8% were cured, little improvement or deterioration being noted in the remainder of the patients. 
We conclude that the clinical profile of acute rheumatic fever, especially that of carditis, is milder in those suffering their first attack compared to those with recurrent episodes.<\/jats:p>","DOI":"10.1017\/s1047951103000064","type":"journal-article","created":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T11:49:54Z","timestamp":1113824994000},"page":"28-35","source":"Crossref","is-referenced-by-count":11,"title":["Clinical profile of acute rheumatic fever in Pakistan"],"prefix":"10.1017","volume":"13","author":[{"given":"Hasina Suleman","family":"Chagani","sequence":"first","affiliation":[]},{"given":"Kalimuddin","family":"Aziz","sequence":"additional","affiliation":[]}],"member":"56","published-online":{"date-parts":[[2005,4,18]]},"reference":[{"key":"S1047951103000064_ref010","doi-asserted-by":"crossref","unstructured":"Alan L , Bisno . Group A streptococcal infection and acute rheumatic fever. N Engl J Med 1991; 325: 783\u2013793.","DOI":"10.1056\/NEJM199109123251106"},{"key":"S1047951103000064_ref036","doi-asserted-by":"crossref","unstructured":"Abbasi AS , Hashmi JA , Robinson RD , Suraya S , Syed SA . Prevalence of heart disease in school children of Karachi. Am J Cardiol 1966; 18: 544\u2013547.","DOI":"10.1016\/0002-9149(66)90008-7"},{"key":"S1047951103000064_ref025","unstructured":"Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285\u2013294."},{"key":"S1047951103000064_ref013","unstructured":"Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185\u2013192."},{"key":"S1047951103000064_ref007","doi-asserted-by":"crossref","unstructured":"Okoroma EO , Ihenacho HNC , Anyanwu CH . Rheumatic fever in Nigerian children. A prospective study of 66 patients. Am J Dis Child 1981; 35: 236\u2013238.","DOI":"10.1001\/archpedi.1981.02130270028010"},{"key":"S1047951103000064_ref031","doi-asserted-by":"crossref","unstructured":"Gordis L . Effectiveness of comprehensive care program in preventing rheumatic fever. N Engl J Med 1973; 289: 331\u2013335.","DOI":"10.1056\/NEJM197308162890701"},{"key":"S1047951103000064_ref012","unstructured":"Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21\u201324."},{"key":"S1047951103000064_ref026","doi-asserted-by":"crossref","unstructured":"Reale A , Colella C , Bruno AM . Mitral stenosis in childhood: Clinical and therapeutic aspects. Am Heart J 1963; 66: 15.","DOI":"10.1016\/0002-8703(63)90064-4"},{"key":"S1047951103000064_ref046","doi-asserted-by":"crossref","unstructured":"Aziz KU , Cheema L , Memon AD . Long-term observations of rheumatic carditis. Cardiol Young 1992; 2: 254\u2013260.","DOI":"10.1017\/S1047951100001001"},{"key":"S1047951103000064_ref041","unstructured":"Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300\u2013305."},{"key":"S1047951103000064_ref002","unstructured":"Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889."},{"key":"S1047951103000064_ref043","unstructured":"Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336\u2013345."},{"key":"S1047951103000064_ref037","unstructured":"Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. 
Pakistan Heart Journal 1981; 14: 2\u20136."},{"key":"S1047951103000064_ref029","doi-asserted-by":"crossref","unstructured":"Hassel TA , Stuart KL . Rheumatic fever prophylaxis. A three-year study. Br Med J 1972; 2: 39\u201340.","DOI":"10.1136\/bmj.2.5909.39"},{"key":"S1047951103000064_ref024","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Berry AM , Duggal S , Hooja V , Ghosh S . Sequel of initial attack of acute rheumatic fever. A prospective 5-year follow-up study. Circulation 1982; 65: 375\u2013379.","DOI":"10.1161\/01.CIR.65.2.375"},{"key":"S1047951103000064_ref022","doi-asserted-by":"crossref","unstructured":"Brownell KD , Rese FB . Acute rheumatic fever in children. Incidence in Borough of New York city. JAMA. 1973; 224: 1593\u20131597.","DOI":"10.1001\/jama.1973.03220260015004"},{"key":"S1047951103000064_ref035","unstructured":"Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071\u20131081."},{"key":"S1047951103000064_ref003","unstructured":"El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980's. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183\u2013203."},{"key":"S1047951103000064_ref045","doi-asserted-by":"crossref","unstructured":"Markowitz M . Eradication of rheumatic fever. An unfulfilled hope. Circulation 1970; 41: 1077\u20131084.","DOI":"10.1161\/01.CIR.41.6.1077"},{"key":"S1047951103000064_ref005","unstructured":"Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886."},{"key":"S1047951103000064_ref017","unstructured":"Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483\u2013494."},{"key":"S1047951103000064_ref028","doi-asserted-by":"crossref","unstructured":"Ehmke DA , Stehbens JA , Young L . Two studies of compliance with daily prophylaxis in rheumatic fever patients in Iowa. Am J Public Health 1980; 70: 1189\u20131193.","DOI":"10.2105\/AJPH.70.11.1189"},{"key":"S1047951103000064_ref021","doi-asserted-by":"crossref","unstructured":"Ward C . The reappraisal of the clinical features in acute and chronic rheumatic heart disease. Etiology implications. Am Heart J 1979; 98: 298\u2013306.","DOI":"10.1016\/0002-8703(79)90040-1"},{"key":"S1047951103000064_ref009","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Thaper MK , Ahmed SA , Hooja V , Tewari P . The initial attack of acute rheumatic fever during childhood in North India. A prospective study of the clinical profile. Circulation 1974; 49: 7\u201312.","DOI":"10.1161\/01.CIR.49.1.7"},{"key":"S1047951103000064_ref016","unstructured":"Strasser T . Rheumatic fever and rheumatic heart disease in the 1970's. WHO Chron. 1978; 32: 18\u201325."},{"key":"S1047951103000064_ref019","doi-asserted-by":"crossref","unstructured":"Bland EF , Jones TD . Rheumatic fever and rheumatic heart disease. A twenty-year report on 1000 patients followed since childhood. Circulation 1951; 4: 836\u2013843.","DOI":"10.1161\/01.CIR.4.6.836"},{"key":"S1047951103000064_ref042","doi-asserted-by":"crossref","unstructured":"Wood HF , McCarty M . Laboratory aids in the diagnosis of rheumatic fever and evaluation of disease activity. Am J Med 1954; 17: 768\u2013774.","DOI":"10.1016\/0002-9343(54)90221-1"},{"key":"S1047951103000064_ref020","doi-asserted-by":"crossref","unstructured":"Baldwin JS , Kerr JM , Kuttner AG , Doyle EF . Observation in rheumatic nodules over 30 years period. 
J Pediatr 1960; 56: 465\u2013470.","DOI":"10.1016\/S0022-3476(60)80358-7"},{"key":"S1047951103000064_ref004","doi-asserted-by":"crossref","unstructured":"Majeed HA , Khan N , Dabbagh M , Naidi K . Acute rheumatic fever during childhood in Kuwait: The mild nature of initial attack. Ann Trop Paediatr 1981; 1: 13\u201320.","DOI":"10.1080\/02724936.1981.11748053"},{"key":"S1047951103000064_ref001","unstructured":"Brittanica: Book of year 1991. Chicago, 1991."},{"key":"S1047951103000064_ref039","unstructured":"Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990."},{"key":"S1047951103000064_ref040","doi-asserted-by":"crossref","unstructured":"Taranta A , Markowitz M . Rheumatic fever. A guide to its recognition, prevention and cure, with special reference to developing countries. M.T.P. Press Ltd., Boston, 1981.","DOI":"10.1007\/978-94-015-7171-5"},{"key":"S1047951103000064_ref032","unstructured":"Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1\u201315."},{"key":"S1047951103000064_ref014","unstructured":"Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2\u20139."},{"key":"S1047951103000064_ref011","doi-asserted-by":"crossref","unstructured":"Gharib R . Acute rheumatic fever in Shiraz, Iran. It's prevalence and characteristics in two socio-economic groups. Am J Dis Child 1969: 118: 694\u2013699.","DOI":"10.1001\/archpedi.1969.02100040696005"},{"key":"S1047951103000064_ref008","unstructured":"Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543\u2013550."},{"key":"S1047951103000064_ref033","doi-asserted-by":"crossref","unstructured":"Spagnuolo M , Pasternack B , Taranta A . Risk of rheumatic fever recurrences after streptococcal infections. Prospective study of clinical and social factors. N Engl J Med 1971; 285: 641\u2013647.","DOI":"10.1056\/NEJM197109162851201"},{"key":"S1047951103000064_ref038","unstructured":"Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539\u2013549."},{"key":"S1047951103000064_ref023","doi-asserted-by":"crossref","unstructured":"Feinstein AR , Spagnuolo M . The clinical patterns of acute rheumatic fever; A reappraisal. Medicine 1962; 41: 279\u2013305.","DOI":"10.1097\/00005792-196212000-00001"},{"key":"S1047951103000064_ref018","unstructured":"Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501\u20131515."},{"key":"S1047951103000064_ref027","unstructured":"Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8\u201314."},{"key":"S1047951103000064_ref034","unstructured":"Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14\u201316."},{"key":"S1047951103000064_ref044","unstructured":"Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389\u2013395."},{"key":"S1047951103000064_ref006","unstructured":"Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. 
Indian pediatr 1983; 20: 849\u2013853."},{"key":"S1047951103000064_ref030","unstructured":"Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599\u2013603."},{"key":"S1047951103000064_ref015","doi-asserted-by":"crossref","unstructured":"Robinson RD , Sultana S , Abbasi AS et al. Acute rheumatic fever in Karachi, Pakistan. Am J Cardiol 1966; 8: 548\u2013551.","DOI":"10.1016\/0002-9149(66)90009-9"}],"container-title":["Cardiology in the Young"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.cambridge.org\/core\/services\/aop-cambridge-core\/content\/view\/S1047951103000064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T22:32:57Z","timestamp":1586212377000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,2]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2003,2]]}},"alternative-id":["S1047951103000064"],"URL":"http:\/\/dx.doi.org\/10.1017\/s1047951103000064","relation":{},"ISSN":["1047-9511","1467-1107"],"issn-type":[{"value":"1047-9511","type":"print"},{"value":"1467-1107","type":"electronic"}],"subject":["Cardiology and Cardiovascular Medicine","General Medicine","Pediatrics, Perinatology, and Child Health"],"published":{"date-parts":[[2003,2]]}}} \ No newline at end of file
diff --git a/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
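
An illustrative way to pull the raw citation strings back out of a refs fixture like this one, using only the stdlib; shown as a sketch (the test suite may consume these files differently, and the path is assumed relative to python/):

    import xml.etree.ElementTree as ET

    TEI_NS = "{http://www.tei-c.org/ns/1.0}"
    XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

    tree = ET.parse("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml")
    for bibl in tree.iter(TEI_NS + "biblStruct"):
        note = bibl.find("{}note[@type='raw_reference']".format(TEI_NS))
        if note is not None:
            print(bibl.get(XML_ID), "->", note.text)
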
diff --git a/python/tests/files/grobid_refs_s1047951103000064.tei.xml b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
new file mode 100644
index 0000000..e0eae8a
--- /dev/null
+++ b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
@@ -0,0 +1,499 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">The community control of rheumatic fever and rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">N</forename><surname>Dondong</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Elkholy</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="285" to="294" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note>Report of a WHO international co-operative project</note>
+ <note type="raw_reference">Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285–294.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Rehman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="185" to="192" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185–192.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever in Sudanese children</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Ismail</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>El Amin</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Arab J Med</title>
+ <imprint>
+ <biblScope unit="volume">2</biblScope>
+ <biblScope unit="page" from="21" to="24" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21–24.</note>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Incidence of heart disease in children at NICVD</title>
+ <author>
+ <persName><forename type="first">K</forename><forename type="middle">U</forename><surname>Aziz</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="300" to="305" />
+ <date type="published" when="1984">1984</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300–305.</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m" type="main">The various manifestations of rheumatic fever as exemplified in childhood and early life</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Cheadle</surname></persName>
+ </author>
+ <imprint>
+ <publisher>Smith and Co</publisher>
+ <biblScope unit="page">1889</biblScope>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889.</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-I. A major public health problem</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="336" to="345" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336–345.</note>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <analytic>
+ <title level="a" type="main">Prevalence of heart disease in school children of Islamabad</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Malik</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Jaffrey</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">Zubeda</forename><surname>Khanum</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pakistan Heart Journal</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <biblScope unit="page" from="2" to="6" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2–6.</note>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease and overcrowding</title>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Watkins</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Quinn</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Am J Public Health</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="1071" to="1081" />
+ <date type="published" when="1948">1948</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071–1081.</note>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <analytic>
+ <title level="a" type="main">The spectrum and specter of rheumatic fever in 1980&apos;s</title>
+ <author>
+ <persName><forename type="first">W</forename><surname>El-Sadr</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Taranta</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Clinical Immunology Up-Date. Edited by Franklin EC</title>
+ <imprint>
+ <biblScope unit="page" from="183" to="203" />
+ <date type="published" when="1979">1979</date>
+ <publisher>Elsevier</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980&apos;s. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183–203.</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">Tonsillitis in adolescent, Bailliere Tendoll and Cox</title>
+ <author>
+ <persName><forename type="first">C</forename><surname>Haig-Brown</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1886">1886</date>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886.</note>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Studies on the transmission within the families of group A hemolytic streptococci</title>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">I</forename><surname>Levine</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Chapman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Guerra</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><surname>Cooper</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Krause</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">J Lab Clin Med</title>
+ <imprint>
+ <biblScope unit="volume">67</biblScope>
+ <biblScope unit="page" from="483" to="494" />
+ <date type="published" when="1966">1966</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483–494.</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="volume">32</biblScope>
+ <biblScope unit="page" from="18" to="25" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Strasser T . Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron. 1978; 32: 18–25.</note>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Brittanica: Book of year 1991</title>
+ <imprint>
+ <date type="published" when="1991">1991</date>
+ <publisher>Chicago</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Brittanica: Book of year 1991. Chicago, 1991.</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <monogr>
+ <title level="m" type="main">Pockets of rheumatic fever in developed world. XI World Congress of Cardiology</title>
+ <author>
+ <persName><forename type="first">R</forename><surname>Talbot</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990">1990</date>
+ <pubPlace>Manila</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990.</note>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease</title>
+ </analytic>
+ <monogr>
+ <title level="j">Circulation</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="A1" to="15" />
+ <date type="published" when="1970">1970</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1–15.</note>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever and rheumatic carditis in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Shafqat</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ramzan</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="2" to="9" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2–9.</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in developing countries</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Padmavati</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">56</biblScope>
+ <biblScope unit="page" from="543" to="550" />
+ <date type="published" when="1979">1979</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543–550.</note>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">Streptococcal infections in families. Factors altering individual susceptibility</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Meyer</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Haggerty</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pediatrics</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="539" to="549" />
+ <date type="published" when="1962">1962</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539–549.</note>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Collagen and connective tissue diseases</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Shanks</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Textbook of Pediatrics</title>
+ <editor>
+ <persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Forfar</surname></persName>
+ <persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Arneil</surname></persName>
+ </editor>
+ <meeting><address><addrLine>Edinburgh</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="page" from="1501" to="1515" />
+ </imprint>
+ <respStmt>
+ <orgName>Churchill Livingstone</orgName>
+ </respStmt>
+ </monogr>
+ <note type="raw_reference">Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501–1515.</note>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Prophylaxis against recurrence of rheumatic fever</title>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Billoo</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">S</forename><surname>Abbasi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Sultana</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><surname>Desa</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <biblScope unit="page" from="8" to="14" />
+ <date type="published" when="1968">1968</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8–14.</note>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="page" from="14" to="16" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14–16.</note>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="389" to="395" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389–395.</note>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever: Clinical profile of 339 cases with long term follow-up</title>
+ <author>
+ <persName><forename type="first">M</forename><forename type="middle">K</forename><surname>Joshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">P</forename><forename type="middle">W</forename><surname>Kandoth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Barve</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Kamat</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Indian pediatr</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="849" to="853" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849–853.</note>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in rural south Indian children</title>
+ <author>
+ <persName><forename type="first">G</forename><surname>Koshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Benjamin</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Cherian</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="599" to="603" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599–603.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 3f84ea4..3839c99 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -27,21 +27,16 @@
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
+ "pages": "1-11",
"title": "Everything is Wonderful",
- "url": null,
"volume": "20"},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
"volume": "14"}
],
"abstract": "Everything you ever wanted to know about nothing",
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 36d90ef..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,17 +1,18 @@
+import json
+import struct
import pytest
-import struct
import responses
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from test_wayback import wayback_client, cdx_client
-
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f:
REAL_TEI_XML = f.read()
+
@pytest.fixture
def grobid_client():
client = GrobidClient(
@@ -19,61 +20,203 @@ def grobid_client():
)
return client
+
@responses.activate
def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=503,
- body=status)
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=503,
+ body=status,
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
- assert resp['status_code'] == 503
- assert resp['status'] == "error"
+ assert resp["status_code"] == 503
+ assert resp["status"] == "error"
+
+
+@responses.activate
+def test_grobid_success_iso_8859(grobid_client):
+ """
+    This may reflect older GROBID behavior: a text/xml response with no declared charset, decoded as ISO-8859-1 by default.
+ """
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
+
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ # print(type(resp['tei_xml']))
+ # print(type(REAL_TEI_XML))
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1")
+
@responses.activate
def test_grobid_success(grobid_client):
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
- assert resp['status_code'] == 200
- assert resp['status'] == "success"
- #print(type(resp['tei_xml']))
- #print(type(REAL_TEI_XML))
- assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
@responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
+ assert len(responses.calls) == worker.counts["total"]
+
+
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
- assert len(responses.calls) == worker.counts['total']
+ refs_row = grobid_client.crossref_refs(crossref_work)
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+    # test that passing just the 'message' sub-object also works
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # GROBID gets one more POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )
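
The assertions above pin down the row shape returned by the new GrobidClient.crossref_refs() helper. A minimal sketch of that shape, with field names taken from the assertions and placeholder values copied from the 978-3-030-64953-1_4 fixture (illustrative only, not sandcrawler code):

    # Shape of the dict returned by GrobidClient.crossref_refs(), as implied by
    # the assertions in test_grobid_refs_978 / test_grobid_refs_s104 above.
    refs_row = {
        "source": "crossref",
        "source_id": "10.1007/978-3-030-64953-1_4",  # DOI of the Crossref work
        "source_ts": "2021-05-10T22:08:45Z",  # timestamp taken from the Crossref record (fixture value)
        "refs_json": [
            {"id": "4_CR93"},  # one dict per parsed reference; only 'id' is asserted on here
        ],
    }

    # When message["reference"] is empty, refs_json comes back empty and no POST is
    # made to /api/processCitationList (the mocked call count stays at 1 above).
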
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b00a88d 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,22 +1,28 @@
-
-import xml
import json
+import xml
+
import pytest
-from grobid2json import *
+from grobid_tei_xml import parse_document_xml
def test_small_xml():
-
- with open('tests/files/small.xml', 'r') as f:
+ """
+    This used to be a test of grobid2json; now it is a compatibility test for
+ the to_legacy_dict() feature of grobid_tei_xml.
+ """
+
+ with open("tests/files/small.xml", "r") as f:
tei_xml = f.read()
- with open('tests/files/small.json', 'r') as f:
- json_form = json.loads(f.read())
+ with open("tests/files/small.json", "r") as f:
+ json_form = json.loads(f.read())
+
+ tei_doc = parse_document_xml(tei_xml)
+ assert tei_doc.to_legacy_dict() == json_form
- assert teixml2json(tei_xml) == json_form
def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
- teixml2json("this is not XML")
+ parse_document_xml("this is not XML")
with pytest.raises(ValueError):
- teixml2json("<xml></xml>")
+ parse_document_xml("<xml></xml>")
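
As the updated docstring notes, this test now exercises grobid_tei_xml's to_legacy_dict() compatibility path instead of the old grobid2json call. A minimal sketch of the migration, reusing the small.xml/small.json fixtures referenced above:

    import json

    from grobid_tei_xml import parse_document_xml

    with open("tests/files/small.xml", "r") as f:
        tei_xml = f.read()

    # old call (removed by this patch): grobid2json.teixml2json(tei_xml)
    # new call: parse into a typed TEI document, then flatten to the legacy dict shape
    tei_doc = parse_document_xml(tei_xml)
    legacy = tei_doc.to_legacy_dict()

    with open("tests/files/small.json", "r") as f:
        assert legacy == json.load(f)
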
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 9a81852..043c63d 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,33 +1,7 @@
-
-import json
-import pytest
-import responses
-
from sandcrawler.html import extract_fulltext_url
+
def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
assert resp == {}
-
- resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
- <head>
- <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
- </head>
- <body>
- <h1>my big article here</h1>
- blah
- </body>
- </html>"""
- )
- assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp['technique'] == "citation_pdf_url"
-
- with open('tests/files/plos_one_article.html', 'rb') as f:
- resp = extract_fulltext_url(
- "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
- f.read(),
- )
- assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index e6e48ac..ba4acf1 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,14 +1,10 @@
-
-import datetime
-import pytest
-
-from sandcrawler.html_ingest import *
+from sandcrawler.ingest_html import *
def test_html_extract_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
+ with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f:
ojs3_html = f.read()
fulltext = html_extract_body_teixml(ojs3_html)
- assert fulltext['status'] == 'success'
+ assert fulltext["status"] == "success"
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index bf26a98..69bd211 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,5 +1,5 @@
-
import datetime
+
import pytest
from sandcrawler.html_metadata import *
@@ -7,14 +7,20 @@ from sandcrawler.html_metadata import *
def test_html_metadata_plos() -> None:
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
assert meta is not None
- assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert (
+ meta.title
+ == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ )
assert meta.doi == "10.1371/journal.pone.0213978"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
assert meta.contrib_names == [
"Yang Li",
"Tuanjie Wang",
@@ -37,17 +43,26 @@ def test_html_metadata_plos() -> None:
assert meta.volume == "14"
assert meta.container_issn == "1932-6203"
assert meta.publisher == "Public Library of Science"
- assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert (
+ meta.raw_references
+ and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"
+ in meta.raw_references
+ )
assert meta.release_type == "article-journal"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
def test_html_metadata_elife() -> None:
-
- with open('tests/files/elife_article.html', 'r') as f:
+
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
- meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
+ meta = html_extract_biblio(
+ "https://elifesciences.org/articles/44753", HTMLParser(elife_html)
+ )
assert meta is not None
assert meta.title == "Parallel visual circuitry in a basal chordate"
assert meta.doi == "10.7554/eLife.44753"
@@ -64,28 +79,34 @@ def test_html_metadata_elife() -> None:
# 2019-04-18
assert meta.release_date == datetime.date(year=2019, month=4, day=18)
assert meta.publisher == "eLife Sciences Publications Limited"
- assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ )
def test_html_metadata_peerj() -> None:
-
- with open('tests/files/peerj_oa_article.html', 'r') as f:
+
+ with open("tests/files/peerj_oa_article.html", "r") as f:
peerj_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
assert meta is not None
- assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ assert (
+ meta.title
+ == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ )
assert meta.doi == "10.7717/peerj.4375"
assert meta.contrib_names == [
- "Heather Piwowar",
- "Jason Priem",
- "Vincent Larivière",
- "Juan Pablo Alperin",
- "Lisa Matthias",
- "Bree Norlander",
- "Ashley Farley",
- "Jevin West",
- "Stefanie Haustein",
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
]
assert meta.container_name == "PeerJ"
# "2018-02-13"
@@ -95,7 +116,7 @@ def test_html_metadata_peerj() -> None:
def test_html_metadata_nature() -> None:
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
@@ -110,12 +131,15 @@ def test_html_metadata_nature() -> None:
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.publisher == "Nature Publishing Group"
# note: some error in dublin code in nature HTML resulting in duplication
- assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ assert (
+ meta.abstract
+ == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ )
def test_html_metadata_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
ojs3_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
@@ -128,19 +152,25 @@ def test_html_metadata_ojs3() -> None:
"Os Keyes",
]
assert meta.container_name == "First Monday"
- assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
assert meta.container_issn == "1396-0466"
# "2020/09/10"
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.lang == "en"
- assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
- assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert (
+ meta.abstract
+ == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ )
+ assert (
+ meta.html_fulltext_url
+ == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ )
assert meta.release_type == "article-journal"
def test_html_metadata_dlib() -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
@@ -149,6 +179,7 @@ def test_html_metadata_dlib() -> None:
# "2017-05-15"
assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
def test_html_metadata_dc_case() -> None:
"""
This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
@@ -166,13 +197,15 @@ def test_html_metadata_dc_case() -> None:
assert meta is not None
assert meta.issue == "123"
+
@pytest.fixture
def adblock() -> Any:
return load_adblock_rules()
+
def test_html_resources(adblock) -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
resources = html_extract_resources(
@@ -185,9 +218,9 @@ def test_html_resources(adblock) -> None:
# check that adblock working
for r in resources:
- assert '/ga.js' not in r['url']
+ assert "/ga.js" not in r["url"]
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
resources = html_extract_resources(
@@ -198,9 +231,9 @@ def test_html_resources(adblock) -> None:
# check that custom adblock working
for r in resources:
- assert 'crossmark-cdn.crossref.org' not in r['url']
+ assert "crossmark-cdn.crossref.org" not in r["url"]
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
monday_html = f.read()
resources = html_extract_resources(
@@ -209,7 +242,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/elife_article.html', 'r') as f:
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
resources = html_extract_resources(
@@ -218,7 +251,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
resources = html_extract_resources(
@@ -226,4 +259,3 @@ def test_html_resources(adblock) -> None:
HTMLParser(nature_html),
adblock,
)
-
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 46346b7..e14a452 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,12 @@
-
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
@@ -21,6 +21,7 @@ def ingest_worker(wayback_client, spn_client):
)
return worker
+
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
grobid_client = GrobidClient(
@@ -41,153 +42,223 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client):
@responses.activate
def test_ingest_success(ingest_worker_pdf):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
- responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
status=200,
- body=json.dumps([]))
- responses.add(responses.GET,
- 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
status=200,
- body=json.dumps([]))
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
- assert resp['status'] == "success"
- assert resp['request'] == request
- assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
- assert type(resp['terminal']['terminal_dt']) == str
- assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
- assert resp['terminal']['terminal_status_code']
- assert type(resp['file_meta']['size_bytes']) == int
- assert resp['file_meta']['mimetype'] == "application/pdf"
- assert resp['cdx']['url'] == TARGET + "/redirect"
- assert 'warc_path' not in resp['cdx']
- assert 'revisit_cdx' not in resp
- assert resp['grobid']['status'] == "success"
- assert resp['grobid']['status_code'] == 200
- assert resp['grobid']['grobid_version']
- assert 'fatcat_release' in resp['grobid']
- assert 'grobid_version' not in resp['grobid']['metadata']
- assert 'fatcat_release' not in resp['grobid']['metadata']
- assert not 'tei_xml' in resp['grobid']
- assert resp['pdf_meta']['status'] == "success"
- assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
- assert resp['pdf_meta'].get('text') is None
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
+
@responses.activate
def test_ingest_landing(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
# this is for second time around; don't want to fetch same landing page
# HTML again and result in a loop
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ body="<html></html>",
+ )
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
- assert resp['status'] == "no-pdf-link"
- assert resp['request'] == request
- assert 'terminal' in resp
- assert 'file_meta' not in resp
- assert 'cdx' not in resp
- assert 'revisit_cdx' not in resp
- assert 'grobid' not in resp
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
+
@responses.activate
def test_ingest_blocklist(ingest_worker):
ingest_worker.base_url_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-url-blocklist"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
@responses.activate
def test_ingest_wall_blocklist(ingest_worker):
ingest_worker.wall_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] == False
- assert resp['status'] == "skip-wall"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 429c6b0..9bd8b5f 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -1,4 +1,3 @@
-
"""
This file contains tests to run against "live" wayback services. They default
to "skip" because you need authentication, and we shouldn't hit these services
@@ -7,10 +6,9 @@ automatically in CI.
Simply uncomment lines to run.
"""
-import json
import pytest
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
@pytest.fixture
@@ -18,16 +16,19 @@ def cdx_client():
client = CdxApiClient()
return client
+
@pytest.fixture
def wayback_client():
client = WaybackClient()
return client
+
@pytest.fixture
def spn_client():
client = SavePageNowClient()
return client
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch(cdx_client):
@@ -42,12 +43,16 @@ def test_cdx_fetch(cdx_client):
assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
assert resp.warc_csize == 25338
assert resp.warc_offset == 240665973
- assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ assert (
+ resp.warc_path
+ == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ )
# bogus datetime; shouldn't match
with pytest.raises(KeyError):
resp = cdx_client.fetch(url, "12345678123456")
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_lookup_best(cdx_client):
@@ -66,24 +71,31 @@ def test_cdx_lookup_best(cdx_client):
assert resp.mimetype == "text/html"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_wayback_fetch(wayback_client):
- resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+ resp = wayback_client.fetch_petabox(
+ 25683,
+ 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
+ )
assert resp.body
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_resource_success(wayback_client):
url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch_spn2(cdx_client):
@@ -104,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client):
# https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
datetime = "20200110222410"
@@ -117,6 +129,7 @@ def test_cdx_fetch_spn2(cdx_client):
assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_ftp(wayback_client):
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
@@ -127,29 +140,30 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
assert resp.revisit_cdx
assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
# not revisit?
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
- assert resp.terminal_status_code == 226
+ assert resp.terminal_status_code in (226, 200)
assert resp.cdx.url == url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_crawl_ftp(spn_client, wayback_client):
@@ -158,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit == True
- #assert resp.status == "success"
- #assert resp.terminal_url == url
- #assert resp.cdx.url == url
+ # assert resp.hit is True
+ # assert resp.status == "success"
+ # assert resp.terminal_url == url
+ # assert resp.cdx.url == url
- assert resp.hit == False
+ assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 29f9e9f..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,77 +1,110 @@
-
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
+
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
+
+
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
+ assert file_meta == {
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
+ }
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) is None
+
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
- assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
- "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
-
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
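For reference, the digest conversion exercised by `test_b32_hex()` above can be reproduced with the standard library alone. The following is an illustrative sketch (the helper name `b32_to_hex` is hypothetical), not the `sandcrawler.b32_hex` implementation touched by this patch:

# Illustrative sketch only -- not code from this patch.
import base64


def b32_to_hex(digest: str) -> str:
    # Accept either "sha1:BASE32" or bare base32; pass 40-char hex through.
    if digest.lower().startswith("sha1:"):
        digest = digest[5:]
    if len(digest) == 40:
        return digest
    return base64.b32decode(digest.upper()).hex()


# Same fixture values as the test above:
assert (
    b32_to_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
    == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
)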
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 255e3fb..9d75655 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,68 +1,71 @@
-
-import pytest
import struct
-import responses
+
import poppler
+import pytest
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
-from test_wayback import wayback_client, cdx_client
-
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
def test_process_fake_pdf():
resp = process_pdf(FAKE_PDF_BYTES)
print(resp)
assert resp.status == "not-pdf"
- with open('tests/files/dummy_zip.zip', 'rb') as f:
+ with open("tests/files/dummy_zip.zip", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'not-pdf'
+ assert resp.status == "not-pdf"
+
-@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
+@pytest.mark.skipif(
+ poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
def test_process_dummy_pdf():
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'success'
+ assert resp.status == "success"
assert resp.page0_thumbnail is not None
assert len(resp.text) > 10
assert resp.meta_xml is None
- assert resp.file_meta['mimetype'] == 'application/pdf'
+ assert resp.file_meta["mimetype"] == "application/pdf"
print(resp.pdf_info)
print(resp.pdf_extra)
- assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+ assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
# 595 x 842
- assert resp.pdf_extra['page0_height'] == 842
- assert resp.pdf_extra['page0_width'] == 595
- assert resp.pdf_extra['page_count'] == 1
+ assert resp.pdf_extra["page0_height"] == 842
+ assert resp.pdf_extra["page0_width"] == 595
+ assert resp.pdf_extra["page_count"] == 1
+
-def test_pdfextract_worker_cdx(wayback_client):
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
def test_pdfextract_blob_worker():
sink = BlackholeSink()
worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
worker.process(pdf_bytes)
-
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 52f26c0..ed17d24 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,4 @@
-
-import pytest
-
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
@@ -9,20 +6,24 @@ def test_cdx_line_pusher():
sink = BlackholeSink()
# vanilla (only default filters)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(sink, cdx_file)
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['pushed'] == 19
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["pushed"] == 19
# HTTP 200 and application/pdf
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ sink,
+ cdx_file,
+ filter_mimetypes=["application/pdf"],
+ filter_http_statuses=[200, 226],
+ )
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['skip-http_status'] == 10
- assert counts['skip-mimetype'] == 2
- assert counts['pushed'] == 7
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["skip-http_status"] == 10
+ assert counts["skip-mimetype"] == 2
+ assert counts["pushed"] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 63dd887..add2c60 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,11 +1,10 @@
-
import json
+
import pytest
import responses
-
-from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
from test_wayback import *
+from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
@@ -16,7 +15,7 @@ PENDING_BODY = {
"https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
"https://cdn.onesignal.com/sdks/OneSignalSDK.js",
- ]
+ ],
}
SUCCESS_BODY = {
"status": "success",
@@ -58,12 +57,12 @@ SUCCESS_BODY = {
"https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
"https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
"https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
- "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
],
- "outlinks":{
+ "outlinks": {
"https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
- "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
- }
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695",
+ },
}
ERROR_BODY = {
"status": "error",
@@ -71,13 +70,38 @@ ERROR_BODY = {
"status_ext": "error:invalid-host-resolution",
"job_id": JOB_ID,
"message": "Couldn't resolve host for http://example5123.com.",
- "resources": []
+ "resources": [],
}
CDX_SPN_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180326070330",
+ TARGET + "/redirect",
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz",
+ ],
]
+
@pytest.fixture
def spn_client():
client = SavePageNowClient(
@@ -88,112 +112,216 @@ def spn_client():
client.poll_seconds = 0.0
return client
+
@responses.activate
def test_savepagenow_success(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
+ body=json.dumps(SUCCESS_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 4
+ assert len(responses.calls) == 5
- assert resp.success == True
+ assert resp.success is True
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
- assert resp.terminal_dt == SUCCESS_BODY['timestamp']
- assert resp.resources == SUCCESS_BODY['resources']
+ assert resp.terminal_dt == SUCCESS_BODY["timestamp"]
+ assert resp.resources == SUCCESS_BODY["resources"]
+
@responses.activate
def test_savepagenow_remote_error(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
- assert len(responses.calls) == 3
+ assert len(responses.calls) == 4
- assert resp.success == False
- assert resp.status == ERROR_BODY['status_ext']
+ assert resp.success is False
+ assert resp.status == ERROR_BODY["status_ext"]
assert resp.request_url == TARGET
- assert resp.terminal_url == None
- assert resp.terminal_dt == None
- assert resp.resources == None
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
+
@responses.activate
def test_savepagenow_500(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=500,
- body=json.dumps(ERROR_BODY))
+ body=json.dumps(ERROR_BODY),
+ )
with pytest.raises(SavePageNowError):
- resp = spn_client.save_url_now_v2(TARGET)
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_savepagenow_no_slots(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 0,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+
+ with pytest.raises(SavePageNowBackoffError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 1
- assert len(responses.calls) == 2
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
status=200,
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ body=WARC_BODY,
+ )
- print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
+ print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
- assert len(responses.calls) == 5
+ assert len(responses.calls) == 6
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.body == WARC_BODY
assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
@@ -201,4 +329,3 @@ def test_crawl_resource(spn_client, wayback_client):
assert type(resp.cdx) == CdxPartial
with pytest.raises(AttributeError):
print(resp.cdx.warc_path)
-
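The reworked SavePageNow tests above all begin by mocking a GET to `/save/status/user`, the new slot-availability check. Below is a minimal, self-contained sketch of that `responses` mocking pattern, reusing the fixture URL and payload shown above; the `demo_slot_check` helper is hypothetical and not part of this patch:

# Minimal sketch of the mocking pattern used in the tests above.
import json

import requests
import responses


@responses.activate
def demo_slot_check() -> None:
    # Intercept the slot-availability endpoint so no real request is made.
    responses.add(
        responses.GET,
        "http://dummy-spnv2/save/status/user",
        status=200,
        body=json.dumps({"available": 0, "daily_captures": 60295, "processing": 1}),
    )
    status = requests.get("http://dummy-spnv2/save/status/user").json()
    # With zero available slots, SavePageNowClient.save_url_now_v2() is expected
    # to raise SavePageNowBackoffError (see test_savepagenow_no_slots above).
    assert status["available"] == 0
    assert len(responses.calls) == 1


demo_slot_check()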
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6bc1ca4..da4dfd8 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,36 +1,156 @@
-
import json
+
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
-
+from sandcrawler import CdxApiClient, WaybackClient
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner, but not right mimetype
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner and mimetype, but wrong status code
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # "best"
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # older
- ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # older
+ [
+ "wiki,fatcat)/",
+ "20180712220054",
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
]
+
@pytest.fixture
def cdx_client():
client = CdxApiClient(
@@ -39,13 +159,13 @@ def cdx_client():
)
return client
+
@responses.activate
def test_cdx_fetch(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -58,16 +178,16 @@ def test_cdx_fetch(cdx_client):
assert resp.warc_offset == 108062304
assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
@responses.activate
def test_cdx_fetch_errors(cdx_client):
with pytest.raises(ValueError):
resp = cdx_client.fetch(CDX_TARGET, "2019")
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -77,14 +197,15 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+ assert resp
+
@responses.activate
def test_cdx_lookup_best(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -95,6 +216,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
WARC_TARGET = "http://fatcat.wiki/"
WARC_BODY = b"""
<html>
@@ -108,6 +230,7 @@ WARC_BODY = b"""
</html>
"""
+
@pytest.fixture
def wayback_client(cdx_client, mocker):
client = WaybackClient(
@@ -127,10 +250,11 @@ def wayback_client(cdx_client, mocker):
return client
+
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
client = WaybackClient(
@@ -150,6 +274,7 @@ def wayback_client_pdf(cdx_client, mocker):
return client
+
@responses.activate
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,14 +284,14 @@ def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
assert resp == WARC_BODY
+
@responses.activate
def test_lookup_resource_success(wayback_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = wayback_client.lookup_resource(CDX_TARGET)
- assert resp.hit == True
+ assert resp.hit is True
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index a996c56..786f863 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -1,12 +1,11 @@
-
import pytest
from sandcrawler.xml import xml_reserialize
def test_xml_reserialize() -> None:
-
- with open('tests/files/scielo_article.jats.xml', 'rb') as f:
+
+ with open("tests/files/scielo_article.jats.xml", "rb") as f:
raw_xml = f.read()
assert b'encoding="ISO-8859-1"' in raw_xml